llama-cpp-capacitor 0.0.13 → 0.0.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LlamaCpp.podspec +17 -17
- package/Package.swift +27 -27
- package/README.md +717 -574
- package/android/build.gradle +88 -69
- package/android/src/main/AndroidManifest.xml +2 -2
- package/android/src/main/CMakeLists-arm64.txt +131 -0
- package/android/src/main/CMakeLists-x86_64.txt +135 -0
- package/android/src/main/CMakeLists.txt +35 -52
- package/android/src/main/java/ai/annadata/plugin/capacitor/LlamaCpp.java +956 -717
- package/android/src/main/java/ai/annadata/plugin/capacitor/LlamaCppPlugin.java +710 -590
- package/android/src/main/jni-utils.h +7 -7
- package/android/src/main/jni.cpp +868 -127
- package/cpp/{rn-completion.cpp → cap-completion.cpp} +202 -24
- package/cpp/{rn-completion.h → cap-completion.h} +22 -11
- package/cpp/{rn-llama.cpp → cap-llama.cpp} +81 -27
- package/cpp/{rn-llama.h → cap-llama.h} +32 -20
- package/cpp/{rn-mtmd.hpp → cap-mtmd.hpp} +15 -15
- package/cpp/{rn-tts.cpp → cap-tts.cpp} +12 -12
- package/cpp/{rn-tts.h → cap-tts.h} +14 -14
- package/cpp/ggml-cpu/ggml-cpu-impl.h +30 -0
- package/dist/docs.json +100 -3
- package/dist/esm/definitions.d.ts +45 -2
- package/dist/esm/definitions.js.map +1 -1
- package/dist/esm/index.d.ts +22 -0
- package/dist/esm/index.js +66 -3
- package/dist/esm/index.js.map +1 -1
- package/dist/plugin.cjs.js +71 -3
- package/dist/plugin.cjs.js.map +1 -1
- package/dist/plugin.js +71 -3
- package/dist/plugin.js.map +1 -1
- package/ios/Sources/LlamaCppPlugin/LlamaCpp.swift +596 -596
- package/ios/Sources/LlamaCppPlugin/LlamaCppPlugin.swift +591 -514
- package/ios/Tests/LlamaCppPluginTests/LlamaCppPluginTests.swift +15 -15
- package/package.json +111 -110
package/cpp/{rn-llama.h → cap-llama.h}
RENAMED

@@ -1,5 +1,5 @@
-#ifndef RNLLAMA_H
-#define RNLLAMA_H
+#ifndef CAPLLAMA_H
+#define CAPLLAMA_H
 
 #include <sstream>
 #include <iostream>
@@ -14,14 +14,14 @@
 #include "llama-impl.h"
 #include "sampling.h"
 #include "nlohmann/json.hpp"
-#include "rn-tts.h"
+#include "cap-tts.h"
 #if defined(__ANDROID__)
 #include <android/log.h>
 #endif
 
 using json = nlohmann::ordered_json;
 
-namespace rnllama {
+namespace capllama {
 
 std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token);
 
@@ -29,17 +29,17 @@ std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::co
 
 lm_ggml_type kv_cache_type_from_str(const std::string & s);
 
-// Forward declarations - actual definitions are in rn-completion.h
+// Forward declarations - actual definitions are in cap-completion.h
 // Note: enum forward declarations not allowed in C++, using include in implementation file
 struct completion_token_output;
 struct completion_partial_output;
-struct llama_rn_context_mtmd;
+struct llama_cap_context_mtmd;
 
-struct llama_rn_context_tts;
+struct llama_cap_context_tts;
 
-struct llama_rn_context_completion;
+struct llama_cap_context_completion;
 
-struct llama_rn_tokenize_result {
+struct llama_cap_tokenize_result {
     std::vector<llama_token> tokens;
     bool has_media = false;
     std::vector<std::string> bitmap_hashes;
@@ -48,7 +48,7 @@ struct llama_rn_tokenize_result {
 };
 
 // Main context class
-struct llama_rn_context {
+struct llama_cap_context {
     // Model state fields
     llama_model *model = nullptr;
     float loading_progress = 0;
@@ -59,13 +59,25 @@ struct llama_rn_context {
     common_chat_templates_ptr templates;
     int n_ctx;
 
+    // Speculative decoding fields
+    llama_model *draft_model = nullptr;
+    llama_context *draft_ctx = nullptr;
+    bool speculative_enabled = false;
+    int speculative_samples = 3; // Mobile-optimized default
+    bool mobile_speculative = true;
+
     // Completion context
-    llama_rn_context_completion *completion = nullptr;
+    llama_cap_context_completion *completion = nullptr;
 
-    ~llama_rn_context();
+    ~llama_cap_context();
 
     bool loadModel(common_params &params_);
 
+    // Speculative decoding methods
+    bool loadDraftModel(const std::string &draft_model_path);
+    void releaseDraftModel();
+    bool isSpectulativeEnabled() const;
+
     // Model methods
     bool validateModelChatTemplate(bool use_jinja, const char *name) const;
     common_chat_params getFormattedChatWithJinja(
@@ -84,7 +96,7 @@ struct llama_rn_context {
         const std::string &messages,
         const std::string &chat_template
     ) const;
-    llama_rn_tokenize_result tokenize(const std::string &text, const std::vector<std::string> &media_paths);
+    llama_cap_tokenize_result tokenize(const std::string &text, const std::vector<std::string> &media_paths);
 
     // Lora methods
     std::vector<common_adapter_lora_info> lora;
@@ -93,7 +105,7 @@ struct llama_rn_context {
     std::vector<common_adapter_lora_info> getLoadedLoraAdapters();
 
     // Multimodal fields and methods
-    llama_rn_context_mtmd *mtmd_wrapper = nullptr;
+    llama_cap_context_mtmd *mtmd_wrapper = nullptr;
     bool has_multimodal = false;
     bool initMultimodal(const std::string &mmproj_path, bool use_gpu);
     bool isMultimodalEnabled() const;
@@ -102,7 +114,7 @@ struct llama_rn_context {
     void releaseMultimodal();
 
     // TTS fields and methods (delegated to TTS context)
-    llama_rn_context_tts *tts_wrapper = nullptr;
+    llama_cap_context_tts *tts_wrapper = nullptr;
    bool has_vocoder = false;
    bool initVocoder(const std::string &vocoder_model_path, int batch_size = -1);
    bool isVocoderEnabled() const;
@@ -125,15 +137,15 @@ inline void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, s
 void log(const char *level, const char *function, int line, const char *format, ...);
 
 // Logging macros
-extern bool rnllama_verbose;
+extern bool capllama_verbose;
 
-#if RNLLAMA_VERBOSE != 1
+#if CAPLLAMA_VERBOSE != 1
 #define LOG_VERBOSE(MSG, ...)
 #else
 #define LOG_VERBOSE(MSG, ...) \
     do \
     { \
-        if (rnllama_verbose) \
+        if (capllama_verbose) \
         { \
             log("VERBOSE", __func__, __LINE__, MSG, ##__VA_ARGS__); \
         } \
@@ -144,6 +156,6 @@ extern bool rnllama_verbose;
 #define LOG_WARNING(MSG, ...) log("WARNING", __func__, __LINE__, MSG, ##__VA_ARGS__)
 #define LOG_INFO(MSG, ...) log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__)
 
-} // namespace rnllama
+} // namespace capllama
 
-#endif /* RNLLAMA_H */
+#endif /* CAPLLAMA_H */
package/cpp/{rn-mtmd.hpp → cap-mtmd.hpp}
RENAMED

@@ -1,6 +1,6 @@
 #pragma once
 
-#include "rn-llama.h"
+#include "cap-llama.h"
 #include "tools/mtmd/mtmd.h"
 #include "tools/mtmd/mtmd-helper.h"
 #include "tools/mtmd/clip.h"
@@ -8,17 +8,17 @@
 #include <vector>
 #include <cstdint>
 
-namespace rnllama {
+namespace capllama {
 
 // MTMD context structure
-struct llama_rn_context_mtmd {
+struct llama_cap_context_mtmd {
     mtmd_context *mtmd_ctx = nullptr;
 
     // State fields
     std::vector<std::string> bitmap_past_hashes;
 
     // Constructor - Initialize multimodal
-    llama_rn_context_mtmd(
+    llama_cap_context_mtmd(
         const std::string &mmproj_path,
         bool use_gpu,
         llama_model *model,
@@ -29,7 +29,7 @@ struct llama_rn_context_mtmd {
     );
 
     // Destructor - Release multimodal resources
-    ~llama_rn_context_mtmd();
+    ~llama_cap_context_mtmd();
 
     // Process media
     void processMedia(
@@ -149,11 +149,11 @@ struct mtmd_tokenize_result {
     mtmd_input_chunks* chunks = nullptr;
 };
 
-// Forward declaration for llama_rn_context
-struct llama_rn_context;
+// Forward declaration for llama_cap_context
+struct llama_cap_context;
 
 // Tokenize text with media function
-inline mtmd_tokenize_result tokenizeWithMedia(llama_rn_context_mtmd *mtmd_wrapper, const std::string &prompt, const std::vector<std::string> &media_paths) {
+inline mtmd_tokenize_result tokenizeWithMedia(llama_cap_context_mtmd *mtmd_wrapper, const std::string &prompt, const std::vector<std::string> &media_paths) {
     mtmd_tokenize_result result;
     mtmd::bitmaps bitmaps;
 
@@ -369,7 +369,7 @@ inline mtmd_tokenize_result tokenizeWithMedia(llama_rn_context_mtmd *mtmd_wrappe
     return result;
 }
 
-inline void llama_rn_context_mtmd::processMedia(
+inline void llama_cap_context_mtmd::processMedia(
     llama_context *ctx,
     const std::string &prompt,
     const std::vector<std::string> &media_paths,
@@ -524,7 +524,7 @@ inline void llama_rn_context_mtmd::processMedia(
     mtmd_input_chunks_free(chunks);
 }
 
-inline llama_rn_context_mtmd::llama_rn_context_mtmd(
+inline llama_cap_context_mtmd::llama_cap_context_mtmd(
     const std::string &mmproj_path,
     bool use_gpu,
     llama_model *model,
@@ -580,23 +580,23 @@ inline llama_rn_context_mtmd::llama_rn_context_mtmd(
     LOG_INFO("Context shifting disabled for multimodal support");
 }
 
-inline llama_rn_context_mtmd::~llama_rn_context_mtmd() {
+inline llama_cap_context_mtmd::~llama_cap_context_mtmd() {
     if (mtmd_ctx != nullptr) {
         mtmd_free(mtmd_ctx);
         mtmd_ctx = nullptr;
     }
 }
 
-inline bool llama_rn_context_mtmd::isEnabled(bool has_multimodal) const {
+inline bool llama_cap_context_mtmd::isEnabled(bool has_multimodal) const {
     return has_multimodal && this != nullptr;
 }
 
-inline bool llama_rn_context_mtmd::supportVision() const {
+inline bool llama_cap_context_mtmd::supportVision() const {
     return mtmd_ctx != nullptr && mtmd_support_vision(mtmd_ctx);
 }
 
-inline bool llama_rn_context_mtmd::supportAudio() const {
+inline bool llama_cap_context_mtmd::supportAudio() const {
     return mtmd_ctx != nullptr && mtmd_support_audio(mtmd_ctx);
 }
 
-} // namespace rnllama
+} // namespace capllama
package/cpp/{rn-tts.cpp → cap-tts.cpp}
RENAMED

@@ -1,5 +1,5 @@
-#include "rn-tts.h"
-#include "rn-llama.h"
+#include "cap-tts.h"
+#include "cap-llama.h"
 #include "anyascii.h"
 #include "common.h"
 #include <regex>
@@ -11,7 +11,7 @@
 #include <thread>
 #include <cmath>
 
-namespace rnllama {
+namespace capllama {
 
 // Constants definitions
 const std::string default_audio_text = "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>";
@@ -269,7 +269,7 @@ std::string audio_data_from_speaker(json speaker, const tts_type type) {
 }
 
 // Constructor and destructor implementations
-llama_rn_context_tts::llama_rn_context_tts(const std::string &vocoder_model_path, int batch_size) {
+llama_cap_context_tts::llama_cap_context_tts(const std::string &vocoder_model_path, int batch_size) {
     common_params vocoder_params;
     vocoder_params.model.path = vocoder_model_path;
     vocoder_params.embedding = true;
@@ -291,14 +291,14 @@ llama_rn_context_tts::llama_rn_context_tts(const std::string &vocoder_model_path
     type = UNKNOWN; // Will be determined when used
 }
 
-llama_rn_context_tts::~llama_rn_context_tts() {
+llama_cap_context_tts::~llama_cap_context_tts() {
     // init_result will handle cleanup automatically when it goes out of scope
     model = nullptr;
     ctx = nullptr;
     type = UNKNOWN;
 }
 
-void llama_rn_context_tts::setGuideTokens(const std::vector<llama_token> &tokens) {
+void llama_cap_context_tts::setGuideTokens(const std::vector<llama_token> &tokens) {
     guide_tokens = tokens;
 }
 
@@ -456,8 +456,8 @@ std::vector<float> embd_to_audio(
     return audio;
 }
 
-// Forward declarations from rn-llama.h
-extern bool rnllama_verbose;
+// Forward declarations from cap-llama.h
+extern bool capllama_verbose;
 void log(const char *level, const char *function, int line, const char *format, ...);
 
 #define LOG_ERROR(MSG, ...) log("ERROR", __func__, __LINE__, MSG, ##__VA_ARGS__)
@@ -465,7 +465,7 @@ void log(const char *level, const char *function, int line, const char *format,
 #define LOG_INFO(MSG, ...) log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__)
 
 // TTS member functions
-tts_type llama_rn_context_tts::getTTSType(llama_rn_context* main_ctx, json speaker) {
+tts_type llama_cap_context_tts::getTTSType(llama_cap_context* main_ctx, json speaker) {
     if (speaker.is_object() && speaker.contains("version")) {
         std::string version = speaker["version"].get<std::string>();
         if (version == "0.2") {
@@ -489,7 +489,7 @@ tts_type llama_rn_context_tts::getTTSType(llama_rn_context* main_ctx, json speak
     return OUTETTS_V0_2;
 }
 
-llama_rn_audio_completion_result llama_rn_context_tts::getFormattedAudioCompletion(llama_rn_context* main_ctx, const std::string &speaker_json_str, const std::string &text_to_speak) {
+llama_cap_audio_completion_result llama_cap_context_tts::getFormattedAudioCompletion(llama_cap_context* main_ctx, const std::string &speaker_json_str, const std::string &text_to_speak) {
     std::string audio_text = default_audio_text;
     std::string audio_data = default_audio_data;
 
@@ -522,7 +522,7 @@ llama_rn_audio_completion_result llama_rn_context_tts::getFormattedAudioCompleti
     }
 }
 
-std::vector<llama_token> llama_rn_context_tts::getAudioCompletionGuideTokens(llama_rn_context* main_ctx, const std::string &text_to_speak) {
+std::vector<llama_token> llama_cap_context_tts::getAudioCompletionGuideTokens(llama_cap_context* main_ctx, const std::string &text_to_speak) {
     const llama_vocab * vocab = llama_model_get_vocab(main_ctx->model);
     const tts_type tts_type = getTTSType(main_ctx);
     std::string clean_text = process_text(text_to_speak, tts_type);
@@ -557,7 +557,7 @@ std::vector<llama_token> llama_rn_context_tts::getAudioCompletionGuideTokens(lla
     return result;
 }
 
-std::vector<float> llama_rn_context_tts::decodeAudioTokens(llama_rn_context* main_ctx, const std::vector<llama_token> &tokens) {
+std::vector<float> llama_cap_context_tts::decodeAudioTokens(llama_cap_context* main_ctx, const std::vector<llama_token> &tokens) {
     std::vector<llama_token> tokens_audio = tokens;
     tts_type tts_type = getTTSType(main_ctx);
     if (tts_type == OUTETTS_V0_3 || tts_type == OUTETTS_V0_2) {
package/cpp/{rn-tts.h → cap-tts.h}
RENAMED

@@ -1,5 +1,5 @@
-#ifndef RNTTS_H
-#define RNTTS_H
+#ifndef CAPTTS_H
+#define CAPTTS_H
 
 #include <vector>
 #include <string>
@@ -9,10 +9,10 @@
 
 using json = nlohmann::ordered_json;
 
-namespace rnllama {
+namespace capllama {
 
 // Forward declarations
-struct llama_rn_context;
+struct llama_cap_context;
 
 // TTS type enumeration
 enum tts_type {
@@ -23,19 +23,19 @@ enum tts_type {
 };
 
 // Audio completion result structure
-struct llama_rn_audio_completion_result {
+struct llama_cap_audio_completion_result {
     std::string prompt;
     const char *grammar;
 };
 
 // TTS context for TTS-specific functionality
-struct llama_rn_context_tts {
+struct llama_cap_context_tts {
     // TTS state fields
     std::vector<llama_token> audio_tokens;
     std::vector<llama_token> guide_tokens;
     bool next_token_uses_guide_token = true;
 
-    // Vocoder fields (from llama_rn_context_vocoder)
+    // Vocoder fields (from llama_cap_context_vocoder)
     common_init_result init_result;
     common_params params;
     llama_model *model = nullptr;
@@ -43,17 +43,17 @@ struct llama_rn_context_tts {
     tts_type type = UNKNOWN;
 
     // Constructor and destructor
-    llama_rn_context_tts(const std::string &vocoder_model_path, int batch_size = -1);
-    ~llama_rn_context_tts();
+    llama_cap_context_tts(const std::string &vocoder_model_path, int batch_size = -1);
+    ~llama_cap_context_tts();
 
     // TTS utility methods
-    tts_type getTTSType(llama_rn_context* main_ctx, json speaker = nullptr);
-    llama_rn_audio_completion_result getFormattedAudioCompletion(llama_rn_context* main_ctx, const std::string &speaker_json_str, const std::string &text_to_speak);
-    std::vector<llama_token> getAudioCompletionGuideTokens(llama_rn_context* main_ctx, const std::string &text_to_speak);
-    std::vector<float> decodeAudioTokens(llama_rn_context* main_ctx, const std::vector<llama_token> &tokens);
+    tts_type getTTSType(llama_cap_context* main_ctx, json speaker = nullptr);
+    llama_cap_audio_completion_result getFormattedAudioCompletion(llama_cap_context* main_ctx, const std::string &speaker_json_str, const std::string &text_to_speak);
+    std::vector<llama_token> getAudioCompletionGuideTokens(llama_cap_context* main_ctx, const std::string &text_to_speak);
+    std::vector<float> decodeAudioTokens(llama_cap_context* main_ctx, const std::vector<llama_token> &tokens);
     void setGuideTokens(const std::vector<llama_token> &tokens);
 };
 
 }
 
-#endif /* RNTTS_H */
+#endif /* CAPTTS_H */
package/cpp/ggml-cpu/ggml-cpu-impl.h
CHANGED

@@ -78,6 +78,36 @@ struct lm_ggml_compute_params {
 #include <sys/prctl.h>
 #endif
 
+// NEON compatibility layer
+#if defined(__ARM_NEON)
+#include <arm_neon.h>
+
+// Only define vcvtnq_s32_f32 for older ARM architectures that don't have it
+// NDK 29+ includes this function for ARMv8 and newer
+#if !defined(__aarch64__) && !defined(__ARM_ARCH_8A__) && defined(__ARM_ARCH) && __ARM_ARCH < 8
+// Emulate vcvtn (round to nearest) for ARMv7
+inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+    // Round to nearest integer
+    const float32x4_t vhalf = vdupq_n_f32(0.5f);
+    const float32x4_t vnhalf = vdupq_n_f32(-0.5f);
+    const float32x4_t vzero = vdupq_n_f32(0.0f);
+    uint32x4_t mask = vcgeq_f32(v, vzero);
+    float32x4_t rounded = vbslq_f32(mask, vaddq_f32(v, vhalf), vsubq_f32(v, vhalf));
+    return vcvtq_s32_f32(rounded);
+}
+#endif
+#endif
+
+#if defined(__s390x__) && defined(LM_GGML_NNPA)
+#ifndef __NNPA__
+#define __NNPA__
+#endif // __NNPA__
+#endif // __s390x__ && LM_GGML_NNPA
+
+#if defined(__ARM_FEATURE_SVE)
+#include <sys/prctl.h>
+#endif
+
 #if defined(__ARM_NEON)
 
 // ref: https://github.com/ggml-org/llama.cpp/pull/5404
package/dist/docs.json
CHANGED

@@ -493,6 +493,82 @@
         "complexTypes": [],
         "slug": "releasevocoder"
       },
+      {
+        "name": "downloadModel",
+        "signature": "(options: { url: string; filename: string; }) => Promise<string>",
+        "parameters": [
+          {
+            "name": "options",
+            "docs": "",
+            "type": "{ url: string; filename: string; }"
+          }
+        ],
+        "returns": "Promise<string>",
+        "tags": [],
+        "docs": "",
+        "complexTypes": [],
+        "slug": "downloadmodel"
+      },
+      {
+        "name": "getDownloadProgress",
+        "signature": "(options: { url: string; }) => Promise<{ progress: number; completed: boolean; failed: boolean; errorMessage?: string; localPath?: string; downloadedBytes: number; totalBytes: number; }>",
+        "parameters": [
+          {
+            "name": "options",
+            "docs": "",
+            "type": "{ url: string; }"
+          }
+        ],
+        "returns": "Promise<{ progress: number; completed: boolean; failed: boolean; errorMessage?: string | undefined; localPath?: string | undefined; downloadedBytes: number; totalBytes: number; }>",
+        "tags": [],
+        "docs": "",
+        "complexTypes": [],
+        "slug": "getdownloadprogress"
+      },
+      {
+        "name": "cancelDownload",
+        "signature": "(options: { url: string; }) => Promise<boolean>",
+        "parameters": [
+          {
+            "name": "options",
+            "docs": "",
+            "type": "{ url: string; }"
+          }
+        ],
+        "returns": "Promise<boolean>",
+        "tags": [],
+        "docs": "",
+        "complexTypes": [],
+        "slug": "canceldownload"
+      },
+      {
+        "name": "getAvailableModels",
+        "signature": "() => Promise<Array<{ name: string; path: string; size: number; }>>",
+        "parameters": [],
+        "returns": "Promise<{ name: string; path: string; size: number; }[]>",
+        "tags": [],
+        "docs": "",
+        "complexTypes": [
+          "Array"
+        ],
+        "slug": "getavailablemodels"
+      },
+      {
+        "name": "convertJsonSchemaToGrammar",
+        "signature": "(options: { schema: string; }) => Promise<string>",
+        "parameters": [
+          {
+            "name": "options",
+            "docs": "",
+            "type": "{ schema: string; }"
+          }
+        ],
+        "returns": "Promise<string>",
+        "tags": [],
+        "docs": "",
+        "complexTypes": [],
+        "slug": "convertjsonschematogrammar"
+      },
       {
         "name": "addListener",
         "signature": "(eventName: string, listenerFunc: (data: any) => void) => Promise<void>",
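The download methods added above pair naturally: `downloadModel` starts a transfer and `getDownloadProgress` is polled by URL until `completed` or `failed` is set, with `cancelDownload` keyed by the same URL. A minimal sketch of that flow using the signatures shown in this hunk; the `LlamaCpp` export name is an assumption, since the plugin's exported object is not shown in this diff.

```typescript
// Sketch only: the `LlamaCpp` export name is assumed, not confirmed by this diff.
import { LlamaCpp } from 'llama-cpp-capacitor';

async function fetchModel(url: string, filename: string): Promise<string> {
  // Start the download; the promise resolves with a string
  // (its exact meaning is not documented in this diff).
  const handle = await LlamaCpp.downloadModel({ url, filename });
  console.log('download started:', handle);

  // Poll progress until the native side reports completion or failure.
  while (true) {
    const p = await LlamaCpp.getDownloadProgress({ url });
    console.log('progress:', p.progress, p.downloadedBytes, '/', p.totalBytes);
    if (p.failed) throw new Error(p.errorMessage ?? 'download failed');
    if (p.completed) return p.localPath ?? filename;
    await new Promise((resolve) => setTimeout(resolve, 500));
  }
}

// Cancelling is keyed by the same URL that started the download.
async function abortDownload(url: string): Promise<void> {
  const cancelled = await LlamaCpp.cancelDownload({ url });
  console.log('cancelled:', cancelled);
}
```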
@@ -4031,6 +4107,27 @@
         "complexTypes": [],
         "type": "number | undefined"
       },
+      {
+        "name": "draft_model",
+        "tags": [],
+        "docs": "Path to draft model for speculative decoding (mobile optimization)",
+        "complexTypes": [],
+        "type": "string | undefined"
+      },
+      {
+        "name": "speculative_samples",
+        "tags": [],
+        "docs": "Number of tokens to predict speculatively (default: 3 for mobile)",
+        "complexTypes": [],
+        "type": "number | undefined"
+      },
+      {
+        "name": "mobile_speculative",
+        "tags": [],
+        "docs": "Enable mobile-optimized speculative decoding",
+        "complexTypes": [],
+        "type": "boolean | undefined"
+      },
       {
         "name": "n_gpu_layers",
         "tags": [],
@@ -4263,11 +4360,11 @@
       {
         "name": "tool_calls",
         "tags": [],
-        "docs": "Tool calls",
+        "docs": "Tool calls (parsed from response)",
         "complexTypes": [
           "Array"
         ],
-        "type": "Array<{\r\n type: 'function';\r\n function: {\r\n name: string;\r\n arguments: string;\r\n };\r\n id?: string;\r\n }>"
+        "type": "Array<{\r\n type: 'function';\r\n function: {\r\n name: string;\r\n arguments: string; // JSON string of arguments\r\n };\r\n id?: string;\r\n }>"
       },
       {
         "name": "content",
@@ -4535,7 +4632,7 @@
       {
         "name": "grammar",
         "tags": [],
-        "docs": "Set grammar for grammar-based sampling.",
+        "docs": "Set grammar for grammar-based sampling (GBNF format). Default: no grammar\r\nThis will override json_schema if both are provided.",
         "complexTypes": [],
         "type": "string | undefined"
       },
package/dist/esm/definitions.d.ts
CHANGED

@@ -13,6 +13,18 @@ export interface NativeContextParams {
     n_batch?: number;
     n_ubatch?: number;
     n_threads?: number;
+    /**
+     * Path to draft model for speculative decoding (mobile optimization)
+     */
+    draft_model?: string;
+    /**
+     * Number of tokens to predict speculatively (default: 3 for mobile)
+     */
+    speculative_samples?: number;
+    /**
+     * Enable mobile-optimized speculative decoding
+     */
+    mobile_speculative?: boolean;
     /**
     * Number of layers to store in VRAM (Currently only for iOS)
     */
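These three optional fields mirror the speculative-decoding members added to the C++ `llama_cap_context` earlier in this diff. A hedged sketch of a params object that enables a draft model; the `model` field name and the context-creation call that would consume this object are assumptions, since neither appears in this hunk.

```typescript
import type { NativeContextParams } from 'llama-cpp-capacitor';

// Illustrative only: `model` is an assumed field name and the paths are
// placeholders; only the three speculative fields come from this diff.
const params: NativeContextParams = {
  model: 'models/main-model.gguf',          // assumed field name
  draft_model: 'models/draft-model.gguf',   // small draft model used for speculation
  speculative_samples: 3,                   // tokens drafted per step (mobile default)
  mobile_speculative: true,                 // enable the mobile-optimized path
};
```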
@@ -87,7 +99,8 @@ export interface NativeCompletionParams {
     */
    json_schema?: string;
    /**
-     * Set grammar for grammar-based sampling.
+     * Set grammar for grammar-based sampling (GBNF format). Default: no grammar
+     * This will override json_schema if both are provided.
     */
    grammar?: string;
    /**
@@ -268,7 +281,7 @@ export interface NativeCompletionResult {
     */
    reasoning_content: string;
    /**
-     * Tool calls
+     * Tool calls (parsed from response)
     */
    tool_calls: Array<{
        type: 'function';
@@ -483,6 +496,10 @@ export interface CompletionParams extends Omit<NativeCompletionParams, 'emit_par
     chatTemplate?: string;
    chat_template?: string;
    jinja?: boolean;
+    /**
+     * GBNF grammar for structured output. Takes precedence over json_schema.
+     */
+    grammar?: string;
    tools?: object;
    parallel_tool_calls?: object;
    tool_choice?: string;
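The new top-level `grammar` field lets a caller pass a GBNF grammar directly in `CompletionParams`, and per the updated docs it takes precedence over `json_schema` when both are set. A small sketch, assuming the completion entry point accepts a `CompletionParams` object and that a `prompt` field exists (the actual call site is not part of this hunk).

```typescript
import type { CompletionParams } from 'llama-cpp-capacitor';

// A tiny GBNF grammar that restricts output to "yes" or "no".
const yesNoGrammar = `root ::= ("yes" | "no")`;

// Illustrative only: `prompt` is an assumed field name; `grammar` and
// `json_schema` come from this diff. When both are set, `grammar` wins.
const params: CompletionParams = {
  prompt: 'Is the sky blue? Answer yes or no.', // assumed field name
  grammar: yesNoGrammar,
  // json_schema: '...', // would be ignored in favor of `grammar`
};
```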
@@ -648,6 +665,32 @@ export interface LlamaCppPlugin {
     releaseVocoder(options: {
        contextId: number;
    }): Promise<void>;
+    downloadModel(options: {
+        url: string;
+        filename: string;
+    }): Promise<string>;
+    getDownloadProgress(options: {
+        url: string;
+    }): Promise<{
+        progress: number;
+        completed: boolean;
+        failed: boolean;
+        errorMessage?: string;
+        localPath?: string;
+        downloadedBytes: number;
+        totalBytes: number;
+    }>;
+    cancelDownload(options: {
+        url: string;
+    }): Promise<boolean>;
+    getAvailableModels(): Promise<Array<{
+        name: string;
+        path: string;
+        size: number;
+    }>>;
+    convertJsonSchemaToGrammar(options: {
+        schema: string;
+    }): Promise<string>;
    addListener(eventName: string, listenerFunc: (data: any) => void): Promise<void>;
    removeAllListeners(eventName: string): Promise<void>;
}
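Taken together with the `grammar` completion field above, `convertJsonSchemaToGrammar` converts a JSON Schema into GBNF on the native side before a completion is started, and `getAvailableModels` enumerates models already on disk. A hedged usage sketch of these two plugin methods, again assuming the plugin is exported as `LlamaCpp` (the export name is not shown in this diff).

```typescript
// Sketch only: the `LlamaCpp` export name is assumed, not confirmed by this diff.
import { LlamaCpp } from 'llama-cpp-capacitor';

async function pickModelAndGrammar() {
  // List models already downloaded to the app's storage.
  const models = await LlamaCpp.getAvailableModels();
  for (const m of models) {
    console.log(`${m.name} at ${m.path} (${m.size} bytes)`);
  }

  // Convert a JSON Schema (passed as a string) into a GBNF grammar string,
  // which can then be supplied via the `grammar` completion parameter.
  const schema = JSON.stringify({
    type: 'object',
    properties: { answer: { type: 'string' } },
    required: ['answer'],
  });
  const grammar = await LlamaCpp.convertJsonSchemaToGrammar({ schema });

  return { models, grammar };
}
```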