@novastera-oss/llamarn 0.2.4 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123) hide show
  1. package/RNLlamaCpp.podspec +3 -2
  2. package/android/CMakeLists.txt +6 -3
  3. package/android/src/main/cpp/include/llama.h +12 -8
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  12. package/cpp/LlamaCppModel.cpp +46 -65
  13. package/cpp/LlamaCppModel.h +5 -0
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/README.md +1 -0
  16. package/cpp/llama.cpp/common/CMakeLists.txt +5 -8
  17. package/cpp/llama.cpp/common/arg.cpp +8 -6
  18. package/cpp/llama.cpp/common/chat-parser.cpp +4 -3
  19. package/cpp/llama.cpp/common/chat-parser.h +2 -1
  20. package/cpp/llama.cpp/common/chat.cpp +4 -4
  21. package/cpp/llama.cpp/common/common.cpp +2 -0
  22. package/cpp/llama.cpp/common/json-partial.cpp +5 -4
  23. package/cpp/llama.cpp/common/json-partial.h +2 -1
  24. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
  25. package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
  26. package/cpp/llama.cpp/convert_hf_to_gguf.py +31 -28
  27. package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
  28. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +2 -0
  29. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
  30. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  31. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +23 -0
  32. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -0
  33. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1 -1
  34. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +19 -8
  35. package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -2
  37. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -8
  38. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
  39. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1 -1
  40. package/cpp/llama.cpp/ggml/src/ggml.c +9 -2
  41. package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
  42. package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
  43. package/cpp/llama.cpp/include/llama.h +12 -8
  44. package/cpp/llama.cpp/src/CMakeLists.txt +3 -0
  45. package/cpp/llama.cpp/src/llama-batch.cpp +19 -12
  46. package/cpp/llama.cpp/src/llama-batch.h +15 -10
  47. package/cpp/llama.cpp/src/llama-context.cpp +226 -151
  48. package/cpp/llama.cpp/src/llama-context.h +25 -8
  49. package/cpp/llama.cpp/src/llama-graph.cpp +50 -47
  50. package/cpp/llama.cpp/src/llama-graph.h +25 -24
  51. package/cpp/llama.cpp/src/llama-kv-cache-recurrent.cpp +1132 -0
  52. package/cpp/llama.cpp/src/llama-kv-cache-recurrent.h +191 -0
  53. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +249 -0
  54. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +136 -0
  55. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1717 -0
  56. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +278 -0
  57. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2746
  58. package/cpp/llama.cpp/src/llama-kv-cache.h +14 -472
  59. package/cpp/llama.cpp/src/llama-kv-cells.h +37 -6
  60. package/cpp/llama.cpp/src/llama-memory.h +44 -0
  61. package/cpp/llama.cpp/src/llama-model.cpp +23 -16
  62. package/cpp/llama.cpp/src/llama-vocab.cpp +7 -2
  63. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
  64. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
  65. package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
  66. package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
  67. package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
  68. package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  69. package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
  70. package/cpp/rn-completion.cpp +101 -52
  71. package/cpp/rn-utils.hpp +8 -1
  72. package/ios/include/common/minja/chat-template.hpp +1 -1
  73. package/ios/include/common/minja/minja.hpp +1 -1
  74. package/ios/include/json-schema-to-grammar.h +4 -4
  75. package/ios/include/llama.h +12 -8
  76. package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
  77. package/ios/libs/llama.xcframework/Info.plist +22 -22
  78. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  79. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4617
  80. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
  81. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +12 -8
  82. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  83. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  84. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
  85. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3557
  86. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  87. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
  88. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  89. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  90. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4638
  91. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3624 -3559
  92. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
  93. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +12 -8
  94. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
  95. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +12 -8
  96. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  97. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
  98. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +12 -8
  99. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  100. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  101. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  102. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4689 -4616
  103. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
  104. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +12 -8
  105. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  106. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  107. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4710 -4637
  108. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3622 -3556
  109. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  110. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
  111. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  112. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  113. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4725 -4653
  114. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
  115. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +12 -8
  116. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  117. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  118. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4746 -4674
  119. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3652 -3587
  120. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  121. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +12 -8
  122. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  123. package/package.json +1 -1
@@ -39,7 +39,8 @@ Pod::Spec.new do |s|
39
39
  "cpp/llama.cpp/common/speculative.{h,cpp}",
40
40
  "cpp/llama.cpp/common/llguidance.{h,cpp}",
41
41
  "cpp/llama.cpp/common/*.hpp",
42
- "cpp/llama.cpp/common/minja/*.hpp"
42
+ "cpp/llama.cpp/vendor/minja/*.hpp"
43
+ "cpp/llama.cpp/vendor/nlohmann/*.hpp"
43
44
 
44
45
  # Include all necessary headers for compilation
45
46
  s.preserve_paths = "ios/include/**/*.h",
@@ -51,7 +52,7 @@ Pod::Spec.new do |s|
51
52
 
52
53
  # Compiler settings
53
54
  s.pod_target_xcconfig = {
54
- "HEADER_SEARCH_PATHS" => "\"$(PODS_TARGET_SRCROOT)/ios/include\" \"$(PODS_TARGET_SRCROOT)/cpp\" \"$(PODS_TARGET_SRCROOT)/ios/generated/RNLlamaCppSpec\" \"$(PODS_TARGET_SRCROOT)/ios/generated\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/include\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/ggml/include\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/common\" \"$(PODS_ROOT)/boost\" \"$(PODS_ROOT)/Headers/Public/React-bridging\" \"$(PODS_ROOT)/Headers/Public/React\"",
55
+ "HEADER_SEARCH_PATHS" => "\"$(PODS_TARGET_SRCROOT)/ios/include\" \"$(PODS_TARGET_SRCROOT)/cpp\" \"$(PODS_TARGET_SRCROOT)/ios/generated/RNLlamaCppSpec\" \"$(PODS_TARGET_SRCROOT)/ios/generated\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/include\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/ggml/include\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/common\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/vendor\" \"$(PODS_ROOT)/boost\" \"$(PODS_ROOT)/Headers/Public/React-bridging\" \"$(PODS_ROOT)/Headers/Public/React\"",
55
56
  "OTHER_CPLUSPLUSFLAGS" => "-DFOLLY_NO_CONFIG -DFOLLY_MOBILE=1 -DFOLLY_USE_LIBCPP=1 -DLLAMA_METAL -DRCT_NEW_ARCH_ENABLED=1 -DFBJSRT_EXPORTED=1",
56
57
  "CLANG_CXX_LANGUAGE_STANDARD" => "c++17",
57
58
  "GCC_OPTIMIZATION_LEVEL" => "3", # Maximum optimization
@@ -141,7 +141,8 @@ target_include_directories(common PRIVATE
141
141
  ${LLAMA_CPP_DIR}/ggml/include
142
142
  ${LLAMA_CPP_DIR}/include
143
143
  ${LLAMA_CPP_DIR}/common
144
- ${LLAMA_CPP_DIR}/common/minja # Add this for chat-template.hpp
144
+ ${LLAMA_CPP_DIR}/vendor/minja
145
+ ${LLAMA_CPP_DIR}/vendor
145
146
  ${LLAMA_CPP_DIR}/src
146
147
  )
147
148
 
@@ -150,7 +151,8 @@ target_include_directories(RNLlamaCpp PRIVATE
150
151
  ${LLAMA_CPP_DIR}/ggml/include
151
152
  ${LLAMA_CPP_DIR}/include
152
153
  ${LLAMA_CPP_DIR}/common
153
- ${LLAMA_CPP_DIR}/common/minja # Add this for chat-template.hpp
154
+ ${LLAMA_CPP_DIR}/vendor/minja # Add this for chat-template.hpp
155
+ ${LLAMA_CPP_DIR}/vendor
154
156
  ${LLAMA_CPP_DIR}/src
155
157
  # Add the generated headers path
156
158
  ${MODULE_ROOT}/android/generated/jni
@@ -244,6 +246,7 @@ target_include_directories(RNLlamaCpp INTERFACE
244
246
  ${LLAMA_CPP_DIR}/ggml/include
245
247
  ${LLAMA_CPP_DIR}/include
246
248
  ${LLAMA_CPP_DIR}/common
247
- ${LLAMA_CPP_DIR}/common/minja
249
+ ${LLAMA_CPP_DIR}/vendor/minja
250
+ ${LLAMA_CPP_DIR}/vendor
248
251
  ${LLAMA_CPP_DIR}/src
249
252
  )
@@ -259,9 +259,9 @@ extern "C" {
259
259
  llama_token * token;
260
260
  float * embd;
261
261
  llama_pos * pos;
262
- int32_t * n_seq_id;
263
- llama_seq_id ** seq_id;
264
- int8_t * logits; // TODO: rename this to "output"
262
+ int32_t * n_seq_id; // TODO: remove, should belong to only 1 sequence
263
+ llama_seq_id ** seq_id; // TODO: become llama_seq_id * seq_id;
264
+ int8_t * logits; // TODO: rename this to "output"
265
265
  } llama_batch;
266
266
 
267
267
  enum llama_model_kv_override_type {
@@ -366,6 +366,8 @@ extern "C" {
366
366
  bool no_perf; // measure performance timings
367
367
  bool op_offload; // offload host tensor operations to device
368
368
  bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
369
+ // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
370
+ // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
369
371
  };
370
372
 
371
373
  // model quantization parameters
@@ -502,6 +504,7 @@ extern "C" {
502
504
  LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
503
505
  LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
504
506
  LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
507
+ LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model);
505
508
 
506
509
  // Get the model's RoPE frequency scaling factor
507
510
  LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
@@ -652,7 +655,6 @@ extern "C" {
652
655
  // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
653
656
  // If the KV cache is RoPEd, the KV data is updated accordingly:
654
657
  // - lazily on next llama_decode()
655
- // - explicitly with llama_kv_self_update()
656
658
  // p0 < 0 : [0, p1]
657
659
  // p1 < 0 : [p0, inf)
658
660
  LLAMA_API void llama_kv_self_seq_add(
@@ -665,7 +667,6 @@ extern "C" {
665
667
  // Integer division of the positions by factor of `d > 1`
666
668
  // If the KV cache is RoPEd, the KV data is updated accordingly:
667
669
  // - lazily on next llama_decode()
668
- // - explicitly with llama_kv_self_update()
669
670
  // p0 < 0 : [0, p1]
670
671
  // p1 < 0 : [p0, inf)
671
672
  LLAMA_API void llama_kv_self_seq_div(
@@ -677,12 +678,14 @@ extern "C" {
677
678
 
678
679
  // Returns the smallest position present in the KV cache for the specified sequence
679
680
  // This is typically non-zero only for SWA caches
681
+ // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
680
682
  // Return -1 if the sequence is empty
681
683
  LLAMA_API llama_pos llama_kv_self_seq_pos_min(
682
684
  struct llama_context * ctx,
683
685
  llama_seq_id seq_id);
684
686
 
685
687
  // Returns the largest position present in the KV cache for the specified sequence
688
+ // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
686
689
  // Return -1 if the sequence is empty
687
690
  LLAMA_API llama_pos llama_kv_self_seq_pos_max(
688
691
  struct llama_context * ctx,
@@ -691,14 +694,15 @@ extern "C" {
691
694
  // Defragment the KV cache
692
695
  // This will be applied:
693
696
  // - lazily on next llama_decode()
694
- // - explicitly with llama_kv_self_update()
695
- LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
697
+ LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx),
698
+ "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
696
699
 
697
700
  // Check if the context supports KV cache shifting
698
701
  LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
699
702
 
700
703
  // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
701
- LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
704
+ LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx),
705
+ "simply remove this call, updates are applied lazily on the next llama_decode()");
702
706
 
703
707
  //
704
708
  // State / sessions
@@ -242,38 +242,12 @@ CompletionOptions LlamaCppModel::parseCompletionOptions(jsi::Runtime& rt, const
242
242
  auto paramsVal = fnObj.getProperty(rt, "parameters");
243
243
  if (paramsVal.isObject()) {
244
244
  try {
245
- // Convert the JSI object directly to nlohmann::json
246
- auto paramsObj = paramsVal.getObject(rt);
247
- json fnParams = json::object();
248
-
249
- // Extract properties directly from the JSI object
250
- jsi::Array propNames = paramsObj.getPropertyNames(rt);
251
- size_t propCount = propNames.size(rt);
252
- for (size_t i = 0; i < propCount; i++) {
253
- jsi::String propName = propNames.getValueAtIndex(rt, i).asString(rt);
254
- std::string key = propName.utf8(rt);
255
- auto value = paramsObj.getProperty(rt, propName);
256
-
257
- if (value.isString()) {
258
- fnParams[key] = value.asString(rt).utf8(rt);
259
- } else if (value.isNumber()) {
260
- fnParams[key] = value.asNumber();
261
- } else if (value.isBool()) {
262
- fnParams[key] = value.getBool();
263
- } else if (value.isNull()) {
264
- fnParams[key] = nullptr;
265
- } else if (value.isObject()) {
266
- if (value.getObject(rt).isArray(rt)) {
267
- fnParams[key] = json::array();
268
- } else {
269
- fnParams[key] = json::object();
270
- }
271
- }
272
- }
273
-
274
- fnJson["parameters"] = fnParams;
275
- } catch (const std::exception&) {
276
- fnJson["parameters"] = json::object();
245
+ // Convert the JSI object directly to nlohmann::json using the new helper
246
+ fnJson["parameters"] = jsiValueToJson(rt, paramsVal);
247
+ } catch (const std::exception& e) {
248
+ // Log error or handle as appropriate
249
+ fprintf(stderr, "Failed to parse tool parameters: %s\n", e.what());
250
+ fnJson["parameters"] = json::object(); // Fallback to empty object
277
251
  }
278
252
  }
279
253
  }
@@ -336,39 +310,12 @@ CompletionOptions LlamaCppModel::parseCompletionOptions(jsi::Runtime& rt, const
336
310
  auto paramsVal = fnObj.getProperty(rt, "parameters");
337
311
  if (paramsVal.isObject()) {
338
312
  try {
339
- // Convert the JSI object directly to nlohmann::json
340
- auto paramsObj = paramsVal.getObject(rt);
341
- json fnParams = json::object();
342
-
343
- // Extract properties directly from the JSI object
344
- jsi::Array propNames = paramsObj.getPropertyNames(rt);
345
- size_t propCount = propNames.size(rt);
346
- for (size_t i = 0; i < propCount; i++) {
347
- jsi::String propName = propNames.getValueAtIndex(rt, i).asString(rt);
348
- std::string key = propName.utf8(rt);
349
- auto value = paramsObj.getProperty(rt, propName);
350
-
351
- if (value.isString()) {
352
- fnParams[key] = value.asString(rt).utf8(rt);
353
- } else if (value.isNumber()) {
354
- fnParams[key] = value.asNumber();
355
- } else if (value.isBool()) {
356
- fnParams[key] = value.getBool();
357
- } else if (value.isNull()) {
358
- fnParams[key] = nullptr;
359
- } else if (value.isObject()) {
360
- // For nested objects, we use a simplified approach
361
- if (value.getObject(rt).isArray(rt)) {
362
- fnParams[key] = json::array();
363
- } else {
364
- fnParams[key] = json::object();
365
- }
366
- }
367
- }
368
-
369
- fnJson["parameters"] = fnParams;
370
- } catch (const std::exception&) {
371
- fnJson["parameters"] = json::object();
313
+ // Convert the JSI object directly to nlohmann::json using the new helper
314
+ fnJson["parameters"] = jsiValueToJson(rt, paramsVal);
315
+ } catch (const std::exception& e) {
316
+ // Log error or handle as appropriate
317
+ fprintf(stderr, "Failed to parse tool parameters: %s\n", e.what());
318
+ fnJson["parameters"] = json::object(); // Fallback to empty object
372
319
  }
373
320
  }
374
321
  }
@@ -553,6 +500,40 @@ jsi::Value LlamaCppModel::jsonToJsi(jsi::Runtime& rt, const json& j) {
553
500
  return jsi::Value::undefined();
554
501
  }
555
502
 
503
+ // Helper to convert JSI Value to nlohmann::json
504
+ json LlamaCppModel::jsiValueToJson(jsi::Runtime& rt, const jsi::Value& val) {
505
+ if (val.isUndefined() || val.isNull()) {
506
+ return nullptr;
507
+ } else if (val.isBool()) {
508
+ return val.getBool();
509
+ } else if (val.isNumber()) {
510
+ return val.getNumber();
511
+ } else if (val.isString()) {
512
+ return val.getString(rt).utf8(rt);
513
+ } else if (val.isObject()) {
514
+ jsi::Object jsiObj = val.getObject(rt);
515
+ if (jsiObj.isArray(rt)) {
516
+ jsi::Array jsiArr = jsiObj.getArray(rt);
517
+ json jsonArr = json::array();
518
+ for (size_t i = 0; i < jsiArr.size(rt); ++i) {
519
+ jsonArr.push_back(jsiValueToJson(rt, jsiArr.getValueAtIndex(rt, i)));
520
+ }
521
+ return jsonArr;
522
+ } else {
523
+ json jsonObj = json::object();
524
+ jsi::Array propNames = jsiObj.getPropertyNames(rt);
525
+ for (size_t i = 0; i < propNames.size(rt); ++i) {
526
+ jsi::String propName = propNames.getValueAtIndex(rt, i).asString(rt);
527
+ std::string key = propName.utf8(rt);
528
+ jsonObj[key] = jsiValueToJson(rt, jsiObj.getProperty(rt, propName));
529
+ }
530
+ return jsonObj;
531
+ }
532
+ }
533
+ // Should not happen for valid JSON-like structures
534
+ return nullptr;
535
+ }
536
+
556
537
  // JSI method for completions (synchronous - kept for compatibility)
557
538
  jsi::Value LlamaCppModel::completionJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count) {
558
539
  if (count < 1 || !args[0].isObject()) {
@@ -21,6 +21,9 @@
21
21
  #include "rn-utils.hpp"
22
22
  #include "rn-llama.hpp"
23
23
 
24
+ // Include json.hpp for json handling
25
+ #include "nlohmann/json.hpp"
26
+
24
27
  namespace facebook::react {
25
28
 
26
29
  // Chat message structure for representing messages in a conversation
@@ -166,6 +169,8 @@ private:
166
169
 
167
170
  // Add CallInvoker for async operations
168
171
  std::shared_ptr<CallInvoker> jsInvoker_;
172
+
173
+ static json jsiValueToJson(jsi::Runtime& rt, const jsi::Value& val); // Declaration of new helper
169
174
  };
170
175
 
171
176
  } // namespace facebook::react
@@ -1,4 +1,4 @@
1
- int LLAMA_BUILD_NUMBER = 5541;
2
- char const *LLAMA_COMMIT = "07e4351c";
1
+ int LLAMA_BUILD_NUMBER = 5572;
2
+ char const *LLAMA_COMMIT = "7675c555";
3
3
  char const *LLAMA_COMPILER = "unknown";
4
4
  char const *LLAMA_BUILD_TARGET = "unknown";
@@ -130,6 +130,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
130
130
  <details>
131
131
  <summary>Bindings</summary>
132
132
 
133
+ - Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama)
133
134
  - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
134
135
  - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
135
136
  - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
@@ -58,23 +58,20 @@ add_library(${TARGET} STATIC
58
58
  arg.cpp
59
59
  arg.h
60
60
  base64.hpp
61
- chat.cpp
62
- chat.h
63
61
  chat-parser.cpp
64
62
  chat-parser.h
63
+ chat.cpp
64
+ chat.h
65
65
  common.cpp
66
66
  common.h
67
67
  console.cpp
68
68
  console.h
69
- json-schema-to-grammar.cpp
70
- json.hpp
71
- json-partial.h
72
69
  json-partial.cpp
70
+ json-partial.h
71
+ json-schema-to-grammar.cpp
73
72
  llguidance.cpp
74
73
  log.cpp
75
74
  log.h
76
- minja/chat-template.hpp
77
- minja/minja.hpp
78
75
  ngram-cache.cpp
79
76
  ngram-cache.h
80
77
  regex-partial.cpp
@@ -147,7 +144,7 @@ if (LLAMA_LLGUIDANCE)
147
144
  set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
148
145
  endif ()
149
146
 
150
- target_include_directories(${TARGET} PUBLIC .)
147
+ target_include_directories(${TARGET} PUBLIC . ../vendor)
151
148
  target_compile_features (${TARGET} PUBLIC cxx_std_17)
152
149
  target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
153
150
 
@@ -1,10 +1,11 @@
1
- #include "gguf.h" // for reading GGUF splits
2
1
  #include "arg.h"
3
2
 
3
+ #include "chat.h"
4
4
  #include "common.h"
5
+ #include "gguf.h" // for reading GGUF splits
6
+ #include "json-schema-to-grammar.h"
5
7
  #include "log.h"
6
8
  #include "sampling.h"
7
- #include "chat.h"
8
9
 
9
10
  // fix problem with std::min and std::max
10
11
  #if defined(_WIN32)
@@ -15,6 +16,9 @@
15
16
  #include <windows.h>
16
17
  #endif
17
18
 
19
+ #define JSON_ASSERT GGML_ASSERT
20
+ #include <nlohmann/json.hpp>
21
+
18
22
  #include <algorithm>
19
23
  #include <climits>
20
24
  #include <cstdarg>
@@ -34,8 +38,6 @@
34
38
  #include <future>
35
39
  #endif
36
40
 
37
- #include "json-schema-to-grammar.h"
38
-
39
41
  using json = nlohmann::ordered_json;
40
42
 
41
43
  std::initializer_list<enum llama_example> mmproj_examples = {
@@ -1346,9 +1348,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1346
1348
  ));
1347
1349
  add_opt(common_arg(
1348
1350
  {"--prio"}, "N",
1349
- string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
1351
+ string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
1350
1352
  [](common_params & params, int prio) {
1351
- if (prio < 0 || prio > 3) {
1353
+ if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
1352
1354
  throw std::invalid_argument("invalid value");
1353
1355
  }
1354
1356
  params.cpuparams.priority = (enum ggml_sched_priority) prio;
@@ -154,9 +154,10 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
154
154
  if (!rest.empty()) {
155
155
  handle_reasoning(rest, /* closed */ !is_partial());
156
156
  }
157
- if (!syntax_.thinking_forced_open) {
158
- throw common_chat_msg_partial_exception(end_think);
159
- }
157
+ // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
158
+ // if (!syntax_.thinking_forced_open) {
159
+ // throw common_chat_msg_partial_exception(end_think);
160
+ // }
160
161
  return true;
161
162
  }
162
163
  }
@@ -2,9 +2,10 @@
2
2
 
3
3
  #include "chat.h"
4
4
  #include "json-partial.h"
5
- #include "json.hpp"
6
5
  #include "regex-partial.h"
7
6
 
7
+ #include <nlohmann/json.hpp>
8
+
8
9
  #include <optional>
9
10
  #include <string>
10
11
  #include <vector>
@@ -1,13 +1,14 @@
1
1
  #include "chat.h"
2
2
  #include "chat-parser.h"
3
3
  #include "common.h"
4
+ #include "json-partial.h"
4
5
  #include "json-schema-to-grammar.h"
5
6
  #include "log.h"
6
- #include "json-partial.h"
7
- #include "minja/chat-template.hpp"
8
- #include "minja/minja.hpp"
9
7
  #include "regex-partial.h"
10
8
 
9
+ #include <minja/chat-template.hpp>
10
+ #include <minja/minja.hpp>
11
+
11
12
  #include <cstdio>
12
13
  #include <exception>
13
14
  #include <iostream>
@@ -16,7 +17,6 @@
16
17
  #include <string>
17
18
  #include <vector>
18
19
 
19
-
20
20
  static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
21
21
  auto time = std::chrono::system_clock::to_time_t(now);
22
22
  auto local_time = *std::localtime(&time);
@@ -203,6 +203,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
203
203
 
204
204
  DWORD p = NORMAL_PRIORITY_CLASS;
205
205
  switch (prio) {
206
+ case GGML_SCHED_PRIO_LOW: p = BELOW_NORMAL_PRIORITY_CLASS; break;
206
207
  case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
207
208
  case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
208
209
  case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
@@ -228,6 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
228
229
 
229
230
  int p = 0;
230
231
  switch (prio) {
232
+ case GGML_SCHED_PRIO_LOW: p = 5; break;
231
233
  case GGML_SCHED_PRIO_NORMAL: p = 0; break;
232
234
  case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
233
235
  case GGML_SCHED_PRIO_HIGH: p = -10; break;
@@ -1,9 +1,10 @@
1
- #include <json-partial.h>
2
- #include "ggml.h"
1
+ #include "json-partial.h"
2
+
3
3
  #include "log.h"
4
- #include <string>
5
4
 
6
- #include <json.hpp>
5
+ #include <nlohmann/json.hpp>
6
+
7
+ #include <string>
7
8
 
8
9
  using json = nlohmann::ordered_json;
9
10
 
@@ -1,5 +1,6 @@
1
1
  #pragma once
2
- #include <json.hpp>
2
+
3
+ #include <nlohmann/json.hpp>
3
4
 
4
5
  // Healing marker (empty if the JSON was fully parsed / wasn't healed).
5
6
  struct common_healing_marker {
@@ -1,8 +1,9 @@
1
1
  #include "json-schema-to-grammar.h"
2
2
  #include "common.h"
3
3
 
4
+ #include <nlohmann/json.hpp>
5
+
4
6
  #include <algorithm>
5
- #include <fstream>
6
7
  #include <map>
7
8
  #include <regex>
8
9
  #include <sstream>
@@ -1,9 +1,9 @@
1
1
  #pragma once
2
2
 
3
- #include "ggml.h"
4
- // Change JSON_ASSERT from assert() to GGML_ASSERT:
5
- #define JSON_ASSERT GGML_ASSERT
6
- #include "json.hpp"
3
+ #include <nlohmann/json_fwd.hpp>
4
+
5
+ #include <functional>
6
+ #include <string>
7
7
 
8
8
  std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
9
9
  bool force_gbnf = false);
@@ -1047,6 +1047,10 @@ class TextModel(ModelBase):
1047
1047
  special_vocab.chat_template = "rwkv-world"
1048
1048
  # hack: Add '\n\n' as the EOT token to make it chat normally
1049
1049
  special_vocab._set_special_token("eot", 261)
1050
+ # hack: Override these as they have already been set (incorrectly)
1051
+ special_vocab.special_token_ids["bos"] = 0
1052
+ special_vocab.special_token_ids["eos"] = 0
1053
+
1050
1054
  special_vocab.add_to_gguf(self.gguf_writer)
1051
1055
 
1052
1056
  def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
@@ -3810,7 +3814,7 @@ class BertModel(TextModel):
3810
3814
  remove_whitespaces = tokenizer.clean_up_tokenization_spaces
3811
3815
  precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
3812
3816
 
3813
- vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
3817
+ vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
3814
3818
  else:
3815
3819
  sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
3816
3820
  sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
@@ -3823,7 +3827,7 @@ class BertModel(TextModel):
3823
3827
  tokenizer = SentencePieceProcessor()
3824
3828
  tokenizer.LoadFromFile(str(tokenizer_path))
3825
3829
 
3826
- vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
3830
+ vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())
3827
3831
 
3828
3832
  tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
3829
3833
  scores: list[float] = [-10000.0] * vocab_size
@@ -3853,33 +3857,26 @@ class BertModel(TextModel):
3853
3857
  unk_token = tokenizer_config_json.get("unk_token")
3854
3858
  unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
3855
3859
 
3856
- for token_id in range(vocab_size):
3860
+ for token_id in range(tokenizer.vocab_size):
3857
3861
  piece = tokenizer._convert_id_to_token(token_id)
3858
- text = piece.encode("utf-8")
3859
- score = tokenizer_json["model"]["vocab"][token_id][1]
3860
-
3861
- toktype = SentencePieceTokenTypes.NORMAL
3862
- if token_id == unk_token_id:
3863
- toktype = SentencePieceTokenTypes.UNKNOWN
3864
- elif token_id in tokenizer.all_special_ids:
3865
- toktype = SentencePieceTokenTypes.CONTROL
3866
- elif token_id in added_vocab.values():
3867
- toktype = SentencePieceTokenTypes.USER_DEFINED
3868
- # No reliable way to detect this, but jina doesn't have any
3869
- # elif tokenizer.IsByte(token_id):
3870
- # toktype = SentencePieceTokenTypes.BYTE
3871
-
3872
- tokens[token_id] = text
3873
- scores[token_id] = score
3874
- toktypes[token_id] = toktype
3875
-
3876
- if vocab_size > len(tokens):
3877
- pad_count = vocab_size - len(tokens)
3878
- logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
3879
- for i in range(1, pad_count + 1):
3880
- tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
3881
- scores.append(-1000.0)
3882
- toktypes.append(SentencePieceTokenTypes.UNUSED)
3862
+ if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
3863
+ text = piece.encode("utf-8")
3864
+ score = tokenizer_json["model"]["vocab"][token_id][1]
3865
+
3866
+ toktype = SentencePieceTokenTypes.NORMAL
3867
+ if token_id == unk_token_id:
3868
+ toktype = SentencePieceTokenTypes.UNKNOWN
3869
+ elif token_id in tokenizer.all_special_ids:
3870
+ toktype = SentencePieceTokenTypes.CONTROL
3871
+ elif token_id in added_vocab.values():
3872
+ toktype = SentencePieceTokenTypes.USER_DEFINED
3873
+ # No reliable way to detect this, but jina doesn't have any
3874
+ # elif tokenizer.IsByte(token_id):
3875
+ # toktype = SentencePieceTokenTypes.BYTE
3876
+
3877
+ tokens[token_id] = text
3878
+ scores[token_id] = score
3879
+ toktypes[token_id] = toktype
3883
3880
 
3884
3881
  if isinstance(tokenizer, SentencePieceProcessor):
3885
3882
  # realign tokens (see HF tokenizer code)
@@ -3892,6 +3889,12 @@ class BertModel(TextModel):
3892
3889
  SentencePieceTokenTypes.UNKNOWN,
3893
3890
  ] + toktypes[3:-1]
3894
3891
 
3892
+ if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
3893
+ # Add mask token missing from sentencepiece.bpe.model
3894
+ tokens[250001] = b'<mask>'
3895
+ scores[250001] = 0.0
3896
+ toktypes[250001] = SentencePieceTokenTypes.CONTROL
3897
+
3895
3898
  self.gguf_writer.add_tokenizer_model("t5")
3896
3899
  self.gguf_writer.add_tokenizer_pre("default")
3897
3900
  self.gguf_writer.add_token_list(tokens)
@@ -2095,9 +2095,6 @@ extern "C" {
2095
2095
  GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
2096
2096
  GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
2097
2097
 
2098
- GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
2099
- GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
2100
-
2101
2098
  // print info and performance information for the graph
2102
2099
  GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
2103
2100
 
@@ -2181,6 +2178,7 @@ extern "C" {
2181
2178
 
2182
2179
  // scheduling priorities
2183
2180
  enum ggml_sched_priority {
2181
+ GGML_SCHED_PRIO_LOW = -1,
2184
2182
  GGML_SCHED_PRIO_NORMAL,
2185
2183
  GGML_SCHED_PRIO_MEDIUM,
2186
2184
  GGML_SCHED_PRIO_HIGH,