npm - @novastera-oss/llamarn - Versions diffs - 0.2.4 → 0.2.6 - Mend

@novastera-oss/llamarn 0.2.4 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (123) hide show

package/RNLlamaCpp.podspec CHANGED Viewed

@@ -39,7 +39,8 @@ Pod::Spec.new do |s|
                    "cpp/llama.cpp/common/speculative.{h,cpp}",
                    "cpp/llama.cpp/common/llguidance.{h,cpp}",
                    "cpp/llama.cpp/common/*.hpp",
-                   "cpp/llama.cpp/common/minja/*.hpp"
+                   "cpp/llama.cpp/vendor/minja/*.hpp"
+                   "cpp/llama.cpp/vendor/nlohmann/*.hpp"
   # Include all necessary headers for compilation
   s.preserve_paths = "ios/include/**/*.h",
@@ -51,7 +52,7 @@ Pod::Spec.new do |s|
   # Compiler settings
   s.pod_target_xcconfig = {
-    "HEADER_SEARCH_PATHS" => "\"$(PODS_TARGET_SRCROOT)/ios/include\" \"$(PODS_TARGET_SRCROOT)/cpp\" \"$(PODS_TARGET_SRCROOT)/ios/generated/RNLlamaCppSpec\" \"$(PODS_TARGET_SRCROOT)/ios/generated\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/include\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/ggml/include\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/common\" \"$(PODS_ROOT)/boost\" \"$(PODS_ROOT)/Headers/Public/React-bridging\" \"$(PODS_ROOT)/Headers/Public/React\"",
+    "HEADER_SEARCH_PATHS" => "\"$(PODS_TARGET_SRCROOT)/ios/include\" \"$(PODS_TARGET_SRCROOT)/cpp\" \"$(PODS_TARGET_SRCROOT)/ios/generated/RNLlamaCppSpec\" \"$(PODS_TARGET_SRCROOT)/ios/generated\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/include\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/ggml/include\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/common\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/vendor\" \"$(PODS_ROOT)/boost\" \"$(PODS_ROOT)/Headers/Public/React-bridging\" \"$(PODS_ROOT)/Headers/Public/React\"",
     "OTHER_CPLUSPLUSFLAGS" => "-DFOLLY_NO_CONFIG -DFOLLY_MOBILE=1 -DFOLLY_USE_LIBCPP=1 -DLLAMA_METAL -DRCT_NEW_ARCH_ENABLED=1 -DFBJSRT_EXPORTED=1",
     "CLANG_CXX_LANGUAGE_STANDARD" => "c++17",
     "GCC_OPTIMIZATION_LEVEL" => "3", # Maximum optimization

package/android/CMakeLists.txt CHANGED Viewed

@@ -141,7 +141,8 @@ target_include_directories(common PRIVATE
     ${LLAMA_CPP_DIR}/ggml/include
     ${LLAMA_CPP_DIR}/include
     ${LLAMA_CPP_DIR}/common
-    ${LLAMA_CPP_DIR}/common/minja  # Add this for chat-template.hpp
+    ${LLAMA_CPP_DIR}/vendor/minja
+    ${LLAMA_CPP_DIR}/vendor
     ${LLAMA_CPP_DIR}/src
 )
@@ -150,7 +151,8 @@ target_include_directories(RNLlamaCpp PRIVATE
     ${LLAMA_CPP_DIR}/ggml/include
     ${LLAMA_CPP_DIR}/include
     ${LLAMA_CPP_DIR}/common
-    ${LLAMA_CPP_DIR}/common/minja  # Add this for chat-template.hpp
+    ${LLAMA_CPP_DIR}/vendor/minja  # Add this for chat-template.hpp
+    ${LLAMA_CPP_DIR}/vendor
     ${LLAMA_CPP_DIR}/src
     # Add the generated headers path
     ${MODULE_ROOT}/android/generated/jni
@@ -244,6 +246,7 @@ target_include_directories(RNLlamaCpp INTERFACE
     ${LLAMA_CPP_DIR}/ggml/include
     ${LLAMA_CPP_DIR}/include
     ${LLAMA_CPP_DIR}/common
-    ${LLAMA_CPP_DIR}/common/minja
+    ${LLAMA_CPP_DIR}/vendor/minja
+    ${LLAMA_CPP_DIR}/vendor
     ${LLAMA_CPP_DIR}/src
 )

package/android/src/main/cpp/include/llama.h CHANGED Viewed

@@ -259,9 +259,9 @@ extern "C" {
         llama_token  *  token;
         float        *  embd;
         llama_pos    *  pos;
-        int32_t      *  n_seq_id;
-        llama_seq_id ** seq_id;
-        int8_t       *  logits; // TODO: rename this to "output"
+        int32_t      *  n_seq_id; // TODO: remove, should belong to only 1 sequence
+        llama_seq_id ** seq_id;   // TODO: become llama_seq_id * seq_id;
+        int8_t       *  logits;   // TODO: rename this to "output"
     } llama_batch;
     enum llama_model_kv_override_type {
@@ -366,6 +366,8 @@ extern "C" {
         bool no_perf;     // measure performance timings
         bool op_offload;  // offload host tensor operations to device
         bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+                          // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
+                          //       ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
     };
     // model quantization parameters
@@ -502,6 +504,7 @@ extern "C" {
     LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head_kv  (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_swa      (const struct llama_model * model);
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
@@ -652,7 +655,6 @@ extern "C" {
     // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_self_seq_add(
@@ -665,7 +667,6 @@ extern "C" {
     // Integer division of the positions by factor of `d > 1`
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_self_seq_div(
@@ -677,12 +678,14 @@ extern "C" {
     // Returns the smallest position present in the KV cache for the specified sequence
     // This is typically non-zero only for SWA caches
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
     // Return -1 if the sequence is empty
     LLAMA_API llama_pos llama_kv_self_seq_pos_min(
             struct llama_context * ctx,
                     llama_seq_id   seq_id);
     // Returns the largest position present in the KV cache for the specified sequence
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
     // Return -1 if the sequence is empty
     LLAMA_API llama_pos llama_kv_self_seq_pos_max(
             struct llama_context * ctx,
@@ -691,14 +694,15 @@ extern "C" {
     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
-    LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
+    LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx),
+            "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
     // Check if the context supports KV cache shifting
     LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+    LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx),
+            "simply remove this call, updates are applied lazily on the next llama_decode()");
     //
     // State / sessions

package/android/src/main/jniLibs/arm64-v8a/libggml-base.so CHANGED Viewed

Binary file

package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so CHANGED Viewed

Binary file

package/android/src/main/jniLibs/arm64-v8a/libggml.so CHANGED Viewed

Binary file

package/android/src/main/jniLibs/arm64-v8a/libllama.so CHANGED Viewed

Binary file

package/android/src/main/jniLibs/x86_64/libggml-base.so CHANGED Viewed

Binary file

package/android/src/main/jniLibs/x86_64/libggml-cpu.so CHANGED Viewed

Binary file

package/android/src/main/jniLibs/x86_64/libggml.so CHANGED Viewed

Binary file

package/android/src/main/jniLibs/x86_64/libllama.so CHANGED Viewed

Binary file

package/cpp/LlamaCppModel.cpp CHANGED Viewed

@@ -242,38 +242,12 @@ CompletionOptions LlamaCppModel::parseCompletionOptions(jsi::Runtime& rt, const
                       auto paramsVal = fnObj.getProperty(rt, "parameters");
                       if (paramsVal.isObject()) {
                         try {
-                          // Convert the JSI object directly to nlohmann::json
-                          auto paramsObj = paramsVal.getObject(rt);
-                          json fnParams = json::object();
-                          // Extract properties directly from the JSI object
-                          jsi::Array propNames = paramsObj.getPropertyNames(rt);
-                          size_t propCount = propNames.size(rt);
-                          for (size_t i = 0; i < propCount; i++) {
-                            jsi::String propName = propNames.getValueAtIndex(rt, i).asString(rt);
-                            std::string key = propName.utf8(rt);
-                            auto value = paramsObj.getProperty(rt, propName);
-                            if (value.isString()) {
-                              fnParams[key] = value.asString(rt).utf8(rt);
-                            } else if (value.isNumber()) {
-                              fnParams[key] = value.asNumber();
-                            } else if (value.isBool()) {
-                              fnParams[key] = value.getBool();
-                            } else if (value.isNull()) {
-                              fnParams[key] = nullptr;
-                            } else if (value.isObject()) {
-                              if (value.getObject(rt).isArray(rt)) {
-                                fnParams[key] = json::array();
-                              } else {
-                                fnParams[key] = json::object();
-                              }
-                            }
-                          }
-                          fnJson["parameters"] = fnParams;
-                        } catch (const std::exception&) {
-                          fnJson["parameters"] = json::object();
+                          // Convert the JSI object directly to nlohmann::json using the new helper
+                          fnJson["parameters"] = jsiValueToJson(rt, paramsVal);
+                        } catch (const std::exception& e) {
+                          // Log error or handle as appropriate
+                          fprintf(stderr, "Failed to parse tool parameters: %s\n", e.what());
+                          fnJson["parameters"] = json::object(); // Fallback to empty object
                         }
                       }
                     }
@@ -336,39 +310,12 @@ CompletionOptions LlamaCppModel::parseCompletionOptions(jsi::Runtime& rt, const
               auto paramsVal = fnObj.getProperty(rt, "parameters");
               if (paramsVal.isObject()) {
                 try {
-                  // Convert the JSI object directly to nlohmann::json
-                  auto paramsObj = paramsVal.getObject(rt);
-                  json fnParams = json::object();
-                  // Extract properties directly from the JSI object
-                  jsi::Array propNames = paramsObj.getPropertyNames(rt);
-                  size_t propCount = propNames.size(rt);
-                  for (size_t i = 0; i < propCount; i++) {
-                    jsi::String propName = propNames.getValueAtIndex(rt, i).asString(rt);
-                    std::string key = propName.utf8(rt);
-                    auto value = paramsObj.getProperty(rt, propName);
-                    if (value.isString()) {
-                      fnParams[key] = value.asString(rt).utf8(rt);
-                    } else if (value.isNumber()) {
-                      fnParams[key] = value.asNumber();
-                    } else if (value.isBool()) {
-                      fnParams[key] = value.getBool();
-                    } else if (value.isNull()) {
-                      fnParams[key] = nullptr;
-                    } else if (value.isObject()) {
-                      // For nested objects, we use a simplified approach
-                      if (value.getObject(rt).isArray(rt)) {
-                        fnParams[key] = json::array();
-                      } else {
-                        fnParams[key] = json::object();
-                      }
-                    }
-                  }
-                  fnJson["parameters"] = fnParams;
-                } catch (const std::exception&) {
-                  fnJson["parameters"] = json::object();
+                  // Convert the JSI object directly to nlohmann::json using the new helper
+                  fnJson["parameters"] = jsiValueToJson(rt, paramsVal);
+                } catch (const std::exception& e) {
+                  // Log error or handle as appropriate
+                  fprintf(stderr, "Failed to parse tool parameters: %s\n", e.what());
+                  fnJson["parameters"] = json::object(); // Fallback to empty object
                 }
               }
             }
@@ -553,6 +500,40 @@ jsi::Value LlamaCppModel::jsonToJsi(jsi::Runtime& rt, const json& j) {
   return jsi::Value::undefined();
 }
+// Helper to convert JSI Value to nlohmann::json. Recursive: undefined/null -> JSON null; bool, number and string -> the matching scalar; arrays -> json arrays; any other object -> a json object keyed by property name.
+json LlamaCppModel::jsiValueToJson(jsi::Runtime& rt, const jsi::Value& val) {
+    if (val.isUndefined() || val.isNull()) {
+        return nullptr; // undefined and null both become JSON null
+    } else if (val.isBool()) {
+        return val.getBool();
+    } else if (val.isNumber()) {
+        return val.getNumber();
+    } else if (val.isString()) {
+        return val.getString(rt).utf8(rt); // stored as a UTF-8 std::string
+    } else if (val.isObject()) {
+        jsi::Object jsiObj = val.getObject(rt);
+        if (jsiObj.isArray(rt)) { // arrays are tested first because they also satisfy isObject()
+            jsi::Array jsiArr = jsiObj.getArray(rt);
+            json jsonArr = json::array();
+            for (size_t i = 0; i < jsiArr.size(rt); ++i) {
+                jsonArr.push_back(jsiValueToJson(rt, jsiArr.getValueAtIndex(rt, i))); // recurse per element, in index order
+            }
+            return jsonArr;
+        } else {
+            json jsonObj = json::object();
+            jsi::Array propNames = jsiObj.getPropertyNames(rt);
+            for (size_t i = 0; i < propNames.size(rt); ++i) {
+                jsi::String propName = propNames.getValueAtIndex(rt, i).asString(rt);
+                std::string key = propName.utf8(rt);
+                jsonObj[key] = jsiValueToJson(rt, jsiObj.getProperty(rt, propName)); // recurse into the property's value
+            }
+            return jsonObj;
+        }
+    }
+    // Should not happen for valid JSON-like structures; any value kind not matched above falls back to JSON null (NOTE(review): presumably Symbol/BigInt values - confirm against the JSI version in use)
+    return nullptr;
+}
 // JSI method for completions (synchronous - kept for compatibility)
 jsi::Value LlamaCppModel::completionJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count) {
   if (count < 1 || !args[0].isObject()) {

package/cpp/LlamaCppModel.h CHANGED Viewed

@@ -21,6 +21,9 @@
 #include "rn-utils.hpp"
 #include "rn-llama.hpp"
+// Include json.hpp for json handling
+#include "nlohmann/json.hpp"
 namespace facebook::react {
 // Chat message structure for representing messages in a conversation
@@ -166,6 +169,8 @@ private:
   // Add CallInvoker for async operations
   std::shared_ptr<CallInvoker> jsInvoker_;
+  static json jsiValueToJson(jsi::Runtime& rt, const jsi::Value& val); // Declaration of new helper
 };
 } // namespace facebook::react

package/cpp/build-info.cpp CHANGED Viewed

@@ -1,4 +1,4 @@
-int LLAMA_BUILD_NUMBER = 5541;
-char const *LLAMA_COMMIT = "07e4351c";
+int LLAMA_BUILD_NUMBER = 5572;
+char const *LLAMA_COMMIT = "7675c555";
 char const *LLAMA_COMPILER = "unknown";
 char const *LLAMA_BUILD_TARGET = "unknown";

package/cpp/llama.cpp/README.md CHANGED Viewed

@@ -130,6 +130,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 <details>
 <summary>Bindings</summary>
+- Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama)
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)

package/cpp/llama.cpp/common/CMakeLists.txt CHANGED Viewed

@@ -58,23 +58,20 @@ add_library(${TARGET} STATIC
     arg.cpp
     arg.h
     base64.hpp
-    chat.cpp
-    chat.h
     chat-parser.cpp
     chat-parser.h
+    chat.cpp
+    chat.h
     common.cpp
     common.h
     console.cpp
     console.h
-    json-schema-to-grammar.cpp
-    json.hpp
-    json-partial.h
     json-partial.cpp
+    json-partial.h
+    json-schema-to-grammar.cpp
     llguidance.cpp
     log.cpp
     log.h
-    minja/chat-template.hpp
-    minja/minja.hpp
     ngram-cache.cpp
     ngram-cache.h
     regex-partial.cpp
@@ -147,7 +144,7 @@ if (LLAMA_LLGUIDANCE)
     set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
 endif ()
-target_include_directories(${TARGET} PUBLIC .)
+target_include_directories(${TARGET} PUBLIC . ../vendor)
 target_compile_features   (${TARGET} PUBLIC cxx_std_17)
 target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)

package/cpp/llama.cpp/common/arg.cpp CHANGED Viewed

@@ -1,10 +1,11 @@
-#include "gguf.h" // for reading GGUF splits
 #include "arg.h"
+#include "chat.h"
 #include "common.h"
+#include "gguf.h" // for reading GGUF splits
+#include "json-schema-to-grammar.h"
 #include "log.h"
 #include "sampling.h"
-#include "chat.h"
 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -15,6 +16,9 @@
 #include <windows.h>
 #endif
+#define JSON_ASSERT GGML_ASSERT
+#include <nlohmann/json.hpp>
 #include <algorithm>
 #include <climits>
 #include <cstdarg>
@@ -34,8 +38,6 @@
 #include <future>
 #endif
-#include "json-schema-to-grammar.h"
 using json = nlohmann::ordered_json;
 std::initializer_list<enum llama_example> mmproj_examples = {
@@ -1346,9 +1348,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--prio"}, "N",
-        string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
+        string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
         [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
+            if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
                 throw std::invalid_argument("invalid value");
             }
             params.cpuparams.priority = (enum ggml_sched_priority) prio;

package/cpp/llama.cpp/common/chat-parser.cpp CHANGED Viewed

@@ -154,9 +154,10 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
             if (!rest.empty()) {
                 handle_reasoning(rest, /* closed */ !is_partial());
             }
-            if (!syntax_.thinking_forced_open) {
-                throw common_chat_msg_partial_exception(end_think);
-            }
+            // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
+            // if (!syntax_.thinking_forced_open) {
+            //     throw common_chat_msg_partial_exception(end_think);
+            // }
             return true;
         }
     }

package/cpp/llama.cpp/common/chat-parser.h CHANGED Viewed

@@ -2,9 +2,10 @@
 #include "chat.h"
 #include "json-partial.h"
-#include "json.hpp"
 #include "regex-partial.h"
+#include <nlohmann/json.hpp>
 #include <optional>
 #include <string>
 #include <vector>

package/cpp/llama.cpp/common/chat.cpp CHANGED Viewed

@@ -1,13 +1,14 @@
 #include "chat.h"
 #include "chat-parser.h"
 #include "common.h"
+#include "json-partial.h"
 #include "json-schema-to-grammar.h"
 #include "log.h"
-#include "json-partial.h"
-#include "minja/chat-template.hpp"
-#include "minja/minja.hpp"
 #include "regex-partial.h"
+#include <minja/chat-template.hpp>
+#include <minja/minja.hpp>
 #include <cstdio>
 #include <exception>
 #include <iostream>
@@ -16,7 +17,6 @@
 #include <string>
 #include <vector>
 static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
     auto time = std::chrono::system_clock::to_time_t(now);
     auto local_time = *std::localtime(&time);

package/cpp/llama.cpp/common/common.cpp CHANGED Viewed

@@ -203,6 +203,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
     DWORD p = NORMAL_PRIORITY_CLASS;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:      p = BELOW_NORMAL_PRIORITY_CLASS; break;
         case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break;
         case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
         case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;
@@ -228,6 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
     int p = 0;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:      p =  5;  break;
         case GGML_SCHED_PRIO_NORMAL:   p =  0;  break;
         case GGML_SCHED_PRIO_MEDIUM:   p = -5;  break;
         case GGML_SCHED_PRIO_HIGH:     p = -10; break;

package/cpp/llama.cpp/common/json-partial.cpp CHANGED Viewed

@@ -1,9 +1,10 @@
-#include <json-partial.h>
-#include "ggml.h"
+#include "json-partial.h"
 #include "log.h"
-#include <string>
-#include <json.hpp>
+#include <nlohmann/json.hpp>
+#include <string>
 using json = nlohmann::ordered_json;

package/cpp/llama.cpp/common/json-partial.h CHANGED Viewed

@@ -1,5 +1,6 @@
 #pragma once
-#include <json.hpp>
+#include <nlohmann/json.hpp>
 // Healing marker (empty if the JSON was fully parsed / wasn't healed).
 struct common_healing_marker {

package/cpp/llama.cpp/common/json-schema-to-grammar.cpp CHANGED Viewed

@@ -1,8 +1,9 @@
 #include "json-schema-to-grammar.h"
 #include "common.h"
+#include <nlohmann/json.hpp>
 #include <algorithm>
-#include <fstream>
 #include <map>
 #include <regex>
 #include <sstream>

package/cpp/llama.cpp/common/json-schema-to-grammar.h CHANGED Viewed

@@ -1,9 +1,9 @@
 #pragma once
-#include "ggml.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
+#include <nlohmann/json_fwd.hpp>
+#include <functional>
+#include <string>
 std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
                                    bool force_gbnf = false);

package/cpp/llama.cpp/convert_hf_to_gguf.py CHANGED Viewed

@@ -1047,6 +1047,10 @@ class TextModel(ModelBase):
         special_vocab.chat_template = "rwkv-world"
         # hack: Add '\n\n' as the EOT token to make it chat normally
         special_vocab._set_special_token("eot", 261)
+        # hack: Override these as they have already been set (incorrectly)
+        special_vocab.special_token_ids["bos"] = 0
+        special_vocab.special_token_ids["eos"] = 0
         special_vocab.add_to_gguf(self.gguf_writer)
     def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
@@ -3810,7 +3814,7 @@ class BertModel(TextModel):
             remove_whitespaces = tokenizer.clean_up_tokenization_spaces
             precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
-            vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
         else:
             sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
             sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
@@ -3823,7 +3827,7 @@ class BertModel(TextModel):
             tokenizer = SentencePieceProcessor()
             tokenizer.LoadFromFile(str(tokenizer_path))
-            vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
@@ -3853,33 +3857,26 @@ class BertModel(TextModel):
             unk_token = tokenizer_config_json.get("unk_token")
             unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
-            for token_id in range(vocab_size):
+            for token_id in range(tokenizer.vocab_size):
                 piece = tokenizer._convert_id_to_token(token_id)
-                text = piece.encode("utf-8")
-                score = tokenizer_json["model"]["vocab"][token_id][1]
-                toktype = SentencePieceTokenTypes.NORMAL
-                if token_id == unk_token_id:
-                    toktype = SentencePieceTokenTypes.UNKNOWN
-                elif token_id in tokenizer.all_special_ids:
-                    toktype = SentencePieceTokenTypes.CONTROL
-                elif token_id in added_vocab.values():
-                    toktype = SentencePieceTokenTypes.USER_DEFINED
-                # No reliable way to detect this, but jina doesn't have any
-                # elif tokenizer.IsByte(token_id):
-                #     toktype = SentencePieceTokenTypes.BYTE
-                tokens[token_id] = text
-                scores[token_id] = score
-                toktypes[token_id] = toktype
-        if vocab_size > len(tokens):
-            pad_count = vocab_size - len(tokens)
-            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-            for i in range(1, pad_count + 1):
-                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(SentencePieceTokenTypes.UNUSED)
+                if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
+                    text = piece.encode("utf-8")
+                    score = tokenizer_json["model"]["vocab"][token_id][1]
+                    toktype = SentencePieceTokenTypes.NORMAL
+                    if token_id == unk_token_id:
+                        toktype = SentencePieceTokenTypes.UNKNOWN
+                    elif token_id in tokenizer.all_special_ids:
+                        toktype = SentencePieceTokenTypes.CONTROL
+                    elif token_id in added_vocab.values():
+                        toktype = SentencePieceTokenTypes.USER_DEFINED
+                    # No reliable way to detect this, but jina doesn't have any
+                    # elif tokenizer.IsByte(token_id):
+                    #     toktype = SentencePieceTokenTypes.BYTE
+                    tokens[token_id] = text
+                    scores[token_id] = score
+                    toktypes[token_id] = toktype
         if isinstance(tokenizer, SentencePieceProcessor):
             # realign tokens (see HF tokenizer code)
@@ -3892,6 +3889,12 @@ class BertModel(TextModel):
                 SentencePieceTokenTypes.UNKNOWN,
             ] + toktypes[3:-1]
+            if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
+                # Add mask token missing from sentencepiece.bpe.model
+                tokens[250001] = b'<mask>'
+                scores[250001] = 0.0
+                toktypes[250001] = SentencePieceTokenTypes.CONTROL
         self.gguf_writer.add_tokenizer_model("t5")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)

package/cpp/llama.cpp/ggml/include/ggml.h CHANGED Viewed

@@ -2095,9 +2095,6 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_graph_get_grad    (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
     GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
-    GGML_API void                 ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
-    GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
@@ -2181,6 +2178,7 @@ extern "C" {
     // scheduling priorities
     enum ggml_sched_priority {
+        GGML_SCHED_PRIO_LOW = -1,
         GGML_SCHED_PRIO_NORMAL,
         GGML_SCHED_PRIO_MEDIUM,
         GGML_SCHED_PRIO_HIGH,