llama-cpp-capacitor 0.0.13 → 0.0.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LlamaCpp.podspec +17 -17
- package/Package.swift +27 -27
- package/README.md +717 -574
- package/android/build.gradle +88 -69
- package/android/src/main/AndroidManifest.xml +2 -2
- package/android/src/main/CMakeLists-arm64.txt +131 -0
- package/android/src/main/CMakeLists-x86_64.txt +135 -0
- package/android/src/main/CMakeLists.txt +35 -52
- package/android/src/main/java/ai/annadata/plugin/capacitor/LlamaCpp.java +956 -717
- package/android/src/main/java/ai/annadata/plugin/capacitor/LlamaCppPlugin.java +710 -590
- package/android/src/main/jni-utils.h +7 -7
- package/android/src/main/jni.cpp +868 -127
- package/cpp/{rn-completion.cpp → cap-completion.cpp} +202 -24
- package/cpp/{rn-completion.h → cap-completion.h} +22 -11
- package/cpp/{rn-llama.cpp → cap-llama.cpp} +81 -27
- package/cpp/{rn-llama.h → cap-llama.h} +32 -20
- package/cpp/{rn-mtmd.hpp → cap-mtmd.hpp} +15 -15
- package/cpp/{rn-tts.cpp → cap-tts.cpp} +12 -12
- package/cpp/{rn-tts.h → cap-tts.h} +14 -14
- package/cpp/ggml-cpu/ggml-cpu-impl.h +30 -0
- package/dist/docs.json +100 -3
- package/dist/esm/definitions.d.ts +45 -2
- package/dist/esm/definitions.js.map +1 -1
- package/dist/esm/index.d.ts +22 -0
- package/dist/esm/index.js +66 -3
- package/dist/esm/index.js.map +1 -1
- package/dist/plugin.cjs.js +71 -3
- package/dist/plugin.cjs.js.map +1 -1
- package/dist/plugin.js +71 -3
- package/dist/plugin.js.map +1 -1
- package/ios/Sources/LlamaCppPlugin/LlamaCpp.swift +596 -596
- package/ios/Sources/LlamaCppPlugin/LlamaCppPlugin.swift +591 -514
- package/ios/Tests/LlamaCppPluginTests/LlamaCppPluginTests.swift +15 -15
- package/package.json +111 -110
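
Most of the native-source churn in this range comes from renaming the shared C++ layer from the upstream `rn-*` names to `cap-*`: the headers, the enclosing namespace, and the `llama_rn_*` context types become `cap-*` / `capllama` / `llama_cap_*`, as the hunks below show. A minimal sketch of what the rename means for a translation unit built against this layer (the old `rnllama` namespace name is an assumption based on the upstream llama.rn project; only the new `capllama` names appear verbatim in the diff, and the snippet compiles only with the package's `cpp/` sources on the include path):

```cpp
// 0.0.13 (assumed upstream-style naming)        0.0.21 (as renamed in this release)
//   #include "rn-llama.h"                 ->      #include "cap-llama.h"
//   namespace rnllama { ... }             ->      namespace capllama { ... }
//   struct llama_rn_context;              ->      struct llama_cap_context;

#include "cap-llama.h"   // renamed header, see package/cpp/{rn-llama.h → cap-llama.h}

// The context type now lives in the capllama namespace under its new name.
void destroyContext(capllama::llama_cap_context* ctx) {
    delete ctx;   // llama_cap_context declares a destructor (see the cap-llama.cpp hunks below)
}
```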

package/cpp/{rn-completion.cpp → cap-completion.cpp}

```diff
@@ -1,14 +1,15 @@
-#include "rn-completion.h"
-#include "rn-llama.h"
-#include "rn-tts.h"
-#include "rn-mtmd.hpp"
+#include "cap-completion.h"
+#include "cap-llama.h"
+#include "cap-tts.h"
+#include "cap-mtmd.hpp"
+#include <algorithm> // For std::sort in speculative decoding
 
 // Include multimodal support
 #include "tools/mtmd/mtmd.h"
 #include "tools/mtmd/mtmd-helper.h"
 #include "tools/mtmd/clip.h"
 
-namespace rnllama {
+namespace capllama {
 
 static bool ends_with(const std::string &str, const std::string &suffix)
 {
@@ -67,19 +68,19 @@ static std::vector<llama_token> format_rerank(const llama_vocab * vocab, const s
 }
 
 // Constructor
-llama_rn_context_completion::llama_rn_context_completion(llama_rn_context* parent)
+llama_cap_context_completion::llama_cap_context_completion(llama_cap_context* parent)
     : parent_ctx(parent) {
 }
 
 // Destructor
-llama_rn_context_completion::~llama_rn_context_completion() {
+llama_cap_context_completion::~llama_cap_context_completion() {
     if (ctx_sampling != nullptr) {
         common_sampler_free(ctx_sampling);
         ctx_sampling = nullptr;
     }
 }
 
-void llama_rn_context_completion::rewind() {
+void llama_cap_context_completion::rewind() {
     is_interrupted = false;
     parent_ctx->params.antiprompt.clear();
     parent_ctx->params.sampling.grammar.clear();
@@ -105,7 +106,7 @@ void llama_rn_context_completion::rewind() {
     }
 }
 
-bool llama_rn_context_completion::initSampling() {
+bool llama_cap_context_completion::initSampling() {
     if (ctx_sampling != nullptr) {
         common_sampler_free(ctx_sampling);
     }
@@ -113,7 +114,7 @@ bool llama_rn_context_completion::initSampling() {
     return ctx_sampling != nullptr;
 }
 
-void llama_rn_context_completion::truncatePrompt(std::vector<llama_token> &prompt_tokens) {
+void llama_cap_context_completion::truncatePrompt(std::vector<llama_token> &prompt_tokens) {
     const int n_left = parent_ctx->n_ctx - parent_ctx->params.n_keep;
     const int n_block_size = n_left / 2;
     const int erased_blocks = (prompt_tokens.size() - parent_ctx->params.n_keep - n_block_size) / n_block_size;
@@ -135,7 +136,7 @@ void llama_rn_context_completion::truncatePrompt(std::vector<llama_token> &promp
     prompt_tokens = new_tokens;
 }
 
-void llama_rn_context_completion::loadPrompt(const std::vector<std::string> &media_paths) {
+void llama_cap_context_completion::loadPrompt(const std::vector<std::string> &media_paths) {
     bool has_media = !media_paths.empty();
 
     if (!has_media) {
@@ -203,11 +204,11 @@ void llama_rn_context_completion::loadPrompt(const std::vector<std::string> &med
         n_past, embd.size(), num_prompt_tokens, has_media ? 1 : 0);
 }
 
-void llama_rn_context_completion::beginCompletion() {
+void llama_cap_context_completion::beginCompletion() {
     beginCompletion(COMMON_CHAT_FORMAT_CONTENT_ONLY, COMMON_REASONING_FORMAT_NONE, false);
 }
 
-void llama_rn_context_completion::beginCompletion(int chat_format, common_reasoning_format reasoning_format, bool thinking_forced_open) {
+void llama_cap_context_completion::beginCompletion(int chat_format, common_reasoning_format reasoning_format, bool thinking_forced_open) {
     // number of tokens to keep when resetting context
     n_remain = parent_ctx->params.n_predict;
     llama_perf_context_reset(parent_ctx->ctx);
@@ -218,11 +219,11 @@ void llama_rn_context_completion::beginCompletion(int chat_format, common_reason
     current_thinking_forced_open = thinking_forced_open;
 }
 
-void llama_rn_context_completion::endCompletion() {
+void llama_cap_context_completion::endCompletion() {
     is_predicting = false;
 }
 
-completion_token_output llama_rn_context_completion::nextToken()
+completion_token_output llama_cap_context_completion::nextToken()
 {
     completion_token_output result;
     result.tok = -1;
@@ -344,7 +345,7 @@ completion_token_output llama_rn_context_completion::nextToken()
     return result;
 }
 
-size_t llama_rn_context_completion::findStoppingStrings(const std::string &text, const size_t last_token_size,
+size_t llama_cap_context_completion::findStoppingStrings(const std::string &text, const size_t last_token_size,
                                                         const stop_type type)
 {
     size_t stop_pos = std::string::npos;
@@ -376,7 +377,7 @@ size_t llama_rn_context_completion::findStoppingStrings(const std::string &text,
     return stop_pos;
 }
 
-completion_token_output llama_rn_context_completion::doCompletion()
+completion_token_output llama_cap_context_completion::doCompletion()
 {
     completion_token_output token_with_probs = nextToken();
 
@@ -444,7 +445,7 @@ completion_token_output llama_rn_context_completion::doCompletion()
     return token_with_probs;
 }
 
-completion_partial_output llama_rn_context_completion::getPartialOutput(const std::string &token_text) {
+completion_partial_output llama_cap_context_completion::getPartialOutput(const std::string &token_text) {
     common_chat_syntax syntax;
     syntax.format = static_cast<common_chat_format>(current_chat_format);
     syntax.reasoning_format = current_reasoning_format;
@@ -463,7 +464,7 @@ completion_partial_output llama_rn_context_completion::getPartialOutput(const st
     return result;
 }
 
-std::vector<float> llama_rn_context_completion::getEmbedding(common_params &embd_params)
+std::vector<float> llama_cap_context_completion::getEmbedding(common_params &embd_params)
 {
     static const int n_embd = llama_model_n_embd(llama_get_model(parent_ctx->ctx));
     if (!embd_params.embedding)
@@ -489,7 +490,7 @@ std::vector<float> llama_rn_context_completion::getEmbedding(common_params &embd
     return out;
 }
 
-std::vector<float> llama_rn_context_completion::rerank(const std::string &query, const std::vector<std::string> &documents)
+std::vector<float> llama_cap_context_completion::rerank(const std::string &query, const std::vector<std::string> &documents)
 {
     std::vector<float> scores;
 
@@ -548,7 +549,7 @@ std::vector<float> llama_rn_context_completion::rerank(const std::string &query,
     return scores;
 }
 
-std::string llama_rn_context_completion::bench(int pp, int tg, int pl, int nr)
+std::string llama_cap_context_completion::bench(int pp, int tg, int pl, int nr)
 {
     if (is_predicting) {
         LOG_ERROR("cannot benchmark while predicting", "");
@@ -563,7 +564,7 @@ std::string llama_rn_context_completion::bench(int pp, int tg, int pl, int nr)
     double pp_std = 0;
     double tg_std = 0;
 
-    // TODO: move batch into llama_rn_context (related https://github.com/mybigday/llama.rn/issues/30)
+    // TODO: move batch into llama_cap_context (related https://github.com/mybigday/llama.rn/issues/30)
     llama_batch batch = llama_batch_init(
         std::min(pp, parent_ctx->params.n_ubatch), // max n_tokens is limited by n_ubatch
         0, // No embeddings
@@ -656,7 +657,7 @@ std::string llama_rn_context_completion::bench(int pp, int tg, int pl, int nr)
         std::string("]");
 }
 
-void llama_rn_context_completion::processMedia(
+void llama_cap_context_completion::processMedia(
     const std::string &prompt,
     const std::vector<std::string> &media_paths
 ) {
@@ -678,4 +679,181 @@ void llama_rn_context_completion::processMedia(
     );
 }
 
-} // namespace rnllama
+// Speculative decoding implementation
+completion_token_output llama_cap_context_completion::nextTokenSpeculative() {
+    // Enable speculative mode
+    use_speculative = parent_ctx->isSpectulativeEnabled();
+
+    if (!use_speculative) {
+        // Fallback to regular token generation
+        return nextToken();
+    }
+
+    completion_token_output result;
+
+    // If we don't have drafted tokens, draft some
+    if (draft_tokens.empty()) {
+        draft_tokens = draftTokens(parent_ctx->speculative_samples);
+        n_drafted = draft_tokens.size();
+    }
+
+    // Try to verify and accept draft tokens
+    if (!draft_tokens.empty()) {
+        int accepted = verifyAndAcceptTokens(draft_tokens);
+        n_accepted += accepted;
+
+        if (accepted > 0) {
+            // Use the first accepted token
+            result.tok = draft_tokens[0];
+            draft_tokens.erase(draft_tokens.begin());
+
+            // Update context
+            embd.push_back(result.tok);
+            --n_remain;
+            num_tokens_predicted++;
+
+            has_next_token = parent_ctx->params.n_predict == -1 || n_remain != 0;
+            return result;
+        }
+    }
+
+    // If no tokens were accepted, fall back to regular sampling
+    draft_tokens.clear();
+    return nextToken();
+}
+
+std::vector<llama_token> llama_cap_context_completion::draftTokens(int n_draft) {
+    std::vector<llama_token> drafted;
+
+    // Check if draft model is available
+    if (!parent_ctx->draft_ctx || !parent_ctx->draft_model || !parent_ctx->isSpectulativeEnabled()) {
+        return drafted; // Return empty vector - will fallback to regular decoding
+    }
+
+    // Copy current context to draft model
+    // Note: KV cache copying may not be available in all llama.cpp versions
+    // For now, we'll skip this optimization and let the draft model generate from scratch
+    // This is still effective for speculative decoding
+
+    // Generate draft tokens using the smaller model
+    for (int i = 0; i < n_draft; i++) {
+        // Create batch with current context
+        llama_batch batch = llama_batch_init(1, 0, 1);
+
+        if (!embd.empty()) {
+            llama_batch_add(&batch, embd.back(), n_past + i, {0}, true);
+        }
+
+        // Decode with draft model
+        if (llama_decode(parent_ctx->draft_ctx, batch) != 0) {
+            llama_batch_free(batch);
+            break;
+        }
+
+        // Sample from draft model (using faster, simpler sampling)
+        const float temp = 0.8f; // Fixed temperature for draft
+        auto logits = llama_get_logits_ith(parent_ctx->draft_ctx, -1);
+        const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(parent_ctx->draft_model));
+
+        // Simple sampling for draft model
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+
+        for (int token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+
+        llama_token_data_array candidates_p = {
+            candidates.data(),
+            candidates.size(),
+            -1,
+            false,
+        };
+
+        // Simple temperature sampling for draft model
+        for (int token_id = 0; token_id < n_vocab; token_id++) {
+            candidates[token_id].logit /= temp; // Apply temperature
+        }
+
+        // Sort by logit (simple greedy sampling for draft)
+        std::sort(candidates.begin(), candidates.end(),
+            [](const llama_token_data& a, const llama_token_data& b) {
+                return a.logit > b.logit;
+            });
+
+        llama_token token = candidates[0].id; // Take top token
+        drafted.push_back(token);
+
+        // Clean up
+        llama_batch_free(batch);
+
+        // Stop if we hit EOS
+        const llama_vocab * vocab = llama_model_get_vocab(parent_ctx->draft_model);
+        if (llama_vocab_is_eog(vocab, token)) {
+            break;
+        }
+    }
+
+    return drafted;
+}
+
+int llama_cap_context_completion::verifyAndAcceptTokens(const std::vector<llama_token> &draft_tokens) {
+    if (draft_tokens.empty() || !parent_ctx->ctx) {
+        return 0;
+    }
+
+    int accepted = 0;
+
+    // Verify each draft token against the main model
+    for (size_t i = 0; i < draft_tokens.size(); i++) {
+        // Create batch for verification
+        llama_batch batch = llama_batch_init(1, 0, 1);
+
+        if (!embd.empty()) {
+            llama_batch_add(&batch, embd.back(), n_past + accepted, {0}, true);
+        }
+
+        // Decode with main model
+        if (llama_decode(parent_ctx->ctx, batch) != 0) {
+            llama_batch_free(batch);
+            break;
+        }
+
+        // Get logits from main model
+        auto logits = llama_get_logits_ith(parent_ctx->ctx, -1);
+        const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(parent_ctx->model));
+
+        // Sample from main model
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+
+        for (int token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+
+        llama_token_data_array candidates_p = {
+            candidates.data(),
+            candidates.size(),
+            -1,
+            false,
+        };
+
+        // Apply sampling from main model using common_sampler
+        llama_token main_token = common_sampler_sample(ctx_sampling, parent_ctx->ctx, -1);
+
+        // Accept if tokens match
+        if (main_token == draft_tokens[i]) {
+            accepted++;
+            common_sampler_accept(ctx_sampling, main_token, true);
+        } else {
+            // Reject and stop verification
+            break;
+        }
+
+        llama_batch_free(batch);
+    }
+
+    return accepted;
+}
+
+} // namespace capllama
```
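
The new `nextTokenSpeculative()`, `draftTokens()`, and `verifyAndAcceptTokens()` methods above implement a draft-then-verify loop: a small draft model proposes up to `n_draft` tokens, the main model re-scores them, and only the prefix whose tokens match the main model's own choices is accepted before control falls back to `nextToken()`. The following self-contained toy, in plain standard C++, mirrors that control flow so it can be read and run in isolation; `draft_next` and `target_next` are stand-ins for the draft and main models and are not part of the plugin.

```cpp
#include <cstdio>
#include <functional>
#include <vector>

// Toy draft-then-verify loop mirroring nextTokenSpeculative():
// propose n_draft tokens with a cheap model, accept the longest prefix
// the target model agrees with, then let the target emit one token itself.
std::vector<int> speculative_generate(
        const std::function<int(const std::vector<int>&)>& draft_next,
        const std::function<int(const std::vector<int>&)>& target_next,
        std::vector<int> ctx, int n_new, int n_draft) {
    std::vector<int> out;
    while ((int)out.size() < n_new) {
        // 1. Draft phase: the cheap model proposes a short continuation.
        std::vector<int> draft, draft_ctx = ctx;
        for (int i = 0; i < n_draft; i++) {
            int t = draft_next(draft_ctx);
            draft.push_back(t);
            draft_ctx.push_back(t);
        }
        // 2. Verify phase: the target model re-scores each proposal in order and
        //    only the matching prefix is kept (like verifyAndAcceptTokens()).
        int accepted = 0;
        for (int t : draft) {
            if (target_next(ctx) != t) break;   // first mismatch rejects the rest
            ctx.push_back(t);
            out.push_back(t);
            accepted++;
            if ((int)out.size() == n_new) return out;
        }
        // 3. Fallback: if the draft was not fully accepted, the target model
        //    produces the next token itself (like the nextToken() fallback).
        if (accepted < n_draft) {
            int t = target_next(ctx);
            ctx.push_back(t);
            out.push_back(t);
        }
    }
    return out;
}

int main() {
    // Stand-in "models": the target counts up; the draft agrees except on every 4th token.
    auto target = [](const std::vector<int>& c) { return (int)c.size(); };
    auto draft  = [](const std::vector<int>& c) { int t = (int)c.size(); return t % 4 == 3 ? -1 : t; };
    for (int t : speculative_generate(draft, target, {}, 10, 3)) std::printf("%d ", t);
    std::printf("\n");   // prints 0..9; every mismatched draft token is corrected by the target
}
```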

package/cpp/{rn-completion.h → cap-completion.h}

```diff
@@ -1,5 +1,5 @@
-#ifndef RN_COMPLETION_H
-#define RN_COMPLETION_H
+#ifndef CAP_COMPLETION_H
+#define CAP_COMPLETION_H
 
 #include "common.h"
 #include "llama.h"
@@ -9,7 +9,7 @@
 
 using json = nlohmann::ordered_json;
 
-namespace rnllama {
+namespace capllama {
 
 // Utility functions
 static inline void llama_batch_clear(llama_batch *batch) {
@@ -17,9 +17,9 @@ static inline void llama_batch_clear(llama_batch *batch) {
 }
 
 // Forward declarations
-struct llama_rn_context;
+struct llama_cap_context;
 
-// Types defined in rn-llama.h (needed here for compilation)
+// Types defined in cap-llama.h (needed here for compilation)
 enum stop_type
 {
     STOP_FULL,
@@ -47,9 +47,9 @@ struct completion_partial_output
 };
 
 // Completion context class
-struct llama_rn_context_completion {
+struct llama_cap_context_completion {
     // Reference to parent context
-    llama_rn_context* parent_ctx;
+    llama_cap_context* parent_ctx;
 
     // Completion state fields
     bool is_predicting = false;
@@ -77,12 +77,18 @@ struct llama_rn_context_completion {
 
     // Sampling context
     common_sampler *ctx_sampling = nullptr;
+
+    // Speculative decoding state
+    std::vector<llama_token> draft_tokens;
+    int n_drafted = 0;
+    int n_accepted = 0;
+    bool use_speculative = false;
 
     // Constructor
-    llama_rn_context_completion(llama_rn_context* parent);
+    llama_cap_context_completion(llama_cap_context* parent);
 
     // Destructor
-    ~llama_rn_context_completion();
+    ~llama_cap_context_completion();
 
     // Completion processing methods
     void rewind();
@@ -93,9 +99,14 @@ struct llama_rn_context_completion {
     void beginCompletion(int chat_format, common_reasoning_format reasoning_format, bool thinking_forced_open);
     void endCompletion();
     completion_token_output nextToken();
+    completion_token_output nextTokenSpeculative(); // NEW: Speculative version
     size_t findStoppingStrings(const std::string &text, const size_t last_token_size, const stop_type type);
     completion_token_output doCompletion();
     completion_partial_output getPartialOutput(const std::string &token_text);
+
+    // Speculative decoding methods
+    std::vector<llama_token> draftTokens(int n_draft);
+    int verifyAndAcceptTokens(const std::vector<llama_token> &draft_tokens);
 
     // Embedding methods
     std::vector<float> getEmbedding(common_params &embd_params);
@@ -111,6 +122,6 @@ struct llama_rn_context_completion {
     );
 };
 
-} // namespace rnllama
+} // namespace capllama
 
-#endif /* RN_COMPLETION_H */
+#endif /* CAP_COMPLETION_H */
```

package/cpp/{rn-llama.cpp → cap-llama.cpp}

```diff
@@ -1,14 +1,14 @@
-#include "rn-llama.h"
-#include "rn-tts.h"
-#include "rn-mtmd.hpp"
-#include "rn-completion.h"
+#include "cap-llama.h"
+#include "cap-tts.h"
+#include "cap-mtmd.hpp"
+#include "cap-completion.h"
 
 // Include multimodal support
 #include "tools/mtmd/mtmd.h"
 #include "tools/mtmd/mtmd-helper.h"
 #include "tools/mtmd/clip.h"
 
-namespace rnllama {
+namespace capllama {
 
 static const std::vector<lm_ggml_type> kv_cache_types = {
     LM_GGML_TYPE_F32,
@@ -122,7 +122,7 @@ std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::co
 }
 
 
-llama_rn_context::~llama_rn_context() {
+llama_cap_context::~llama_cap_context() {
     if (completion != nullptr) {
         delete completion;
         completion = nullptr;
@@ -130,9 +130,10 @@ llama_rn_context::~llama_rn_context() {
 
     releaseMultimodal();
     releaseVocoder();
+    releaseDraftModel(); // Clean up speculative decoding resources
 }
 
-bool llama_rn_context::loadModel(common_params &params_)
+bool llama_cap_context::loadModel(common_params &params_)
 {
     params = params_;
     llama_init = common_init_from_params(params);
@@ -150,7 +151,7 @@ bool llama_rn_context::loadModel(common_params &params_)
     if (completion != nullptr) {
         delete completion;
     }
-    completion = new llama_rn_context_completion(this);
+    completion = new llama_cap_context_completion(this);
 
     // Initialize context shift flag
     LOG_INFO("ctx_shift: %s", params.ctx_shift ? "enabled" : "disabled");
@@ -162,7 +163,7 @@ bool llama_rn_context::loadModel(common_params &params_)
 }
 
 
-bool llama_rn_context::validateModelChatTemplate(bool use_jinja, const char *name) const {
+bool llama_cap_context::validateModelChatTemplate(bool use_jinja, const char *name) const {
     const char * tmpl = llama_model_chat_template(model, name);
     if (tmpl == nullptr) {
         return false;
@@ -170,7 +171,7 @@ bool llama_rn_context::validateModelChatTemplate(bool use_jinja, const char *nam
     return common_chat_verify_template(tmpl, use_jinja);
 }
 
-common_chat_params llama_rn_context::getFormattedChatWithJinja(
+common_chat_params llama_cap_context::getFormattedChatWithJinja(
     const std::string& messages,
     const std::string& chat_template,
     const std::string& json_schema,
@@ -222,7 +223,7 @@ common_chat_params llama_rn_context::getFormattedChatWithJinja(
     }
 }
 
-std::string llama_rn_context::getFormattedChat(
+std::string llama_cap_context::getFormattedChat(
     const std::string &messages,
     const std::string &chat_template
 ) const {
@@ -239,14 +240,14 @@ std::string llama_rn_context::getFormattedChat(
     }
 }
 
-llama_rn_tokenize_result llama_rn_context::tokenize(const std::string &text, const std::vector<std::string> &media_paths) {
+llama_cap_tokenize_result llama_cap_context::tokenize(const std::string &text, const std::vector<std::string> &media_paths) {
     if (media_paths.size() > 0) {
         if (!isMultimodalEnabled()) {
             throw std::runtime_error("Multimodal is not enabled but media paths are provided");
         }
         auto result = tokenizeWithMedia(mtmd_wrapper, text, media_paths);
         mtmd_input_chunks_free(result.chunks);
-        llama_rn_tokenize_result tokenize_result = {
+        llama_cap_tokenize_result tokenize_result = {
             .tokens = result.tokens,
             .has_media = true,
             .bitmap_hashes = result.bitmap_hashes,
@@ -257,7 +258,7 @@ llama_rn_tokenize_result llama_rn_context::tokenize(const std::string &text, con
     }
     std::vector<llama_token> text_tokens;
     text_tokens = common_tokenize(ctx, text, false);
-    llama_rn_tokenize_result tokenize_result = {
+    llama_cap_tokenize_result tokenize_result = {
         .tokens = text_tokens,
         .has_media = false,
         .bitmap_hashes = {},
@@ -267,7 +268,7 @@ llama_rn_tokenize_result llama_rn_context::tokenize(const std::string &text, con
     return tokenize_result;
 }
 
-int llama_rn_context::applyLoraAdapters(std::vector<common_adapter_lora_info> lora) {
+int llama_cap_context::applyLoraAdapters(std::vector<common_adapter_lora_info> lora) {
     for (auto &la : lora) {
         la.ptr = llama_adapter_lora_init(model, la.path.c_str());
         if (la.ptr == nullptr) {
@@ -280,18 +281,18 @@ int llama_rn_context::applyLoraAdapters(std::vector<common_adapter_lora_info> lo
     return 0;
 }
 
-void llama_rn_context::removeLoraAdapters() {
+void llama_cap_context::removeLoraAdapters() {
     this->lora.clear();
     common_set_adapter_lora(ctx, this->lora); // apply empty list
 }
 
-std::vector<common_adapter_lora_info> llama_rn_context::getLoadedLoraAdapters() {
+std::vector<common_adapter_lora_info> llama_cap_context::getLoadedLoraAdapters() {
    return this->lora;
 }
 
-bool llama_rn_context::initMultimodal(const std::string &mmproj_path, bool use_gpu) {
+bool llama_cap_context::initMultimodal(const std::string &mmproj_path, bool use_gpu) {
     try {
-        mtmd_wrapper = new llama_rn_context_mtmd(mmproj_path, use_gpu, model, ctx, params, has_multimodal, params);
+        mtmd_wrapper = new llama_cap_context_mtmd(mmproj_path, use_gpu, model, ctx, params, has_multimodal, params);
         return true;
     } catch (const std::exception& e) {
         LOG_ERROR("[DEBUG] Failed to initialize multimodal: %s", e.what());
@@ -299,19 +300,19 @@ bool llama_rn_context::initMultimodal(const std::string &mmproj_path, bool use_g
     }
 }
 
-bool llama_rn_context::isMultimodalEnabled() const {
+bool llama_cap_context::isMultimodalEnabled() const {
     return mtmd_wrapper != nullptr && mtmd_wrapper->isEnabled(has_multimodal);
 }
 
-bool llama_rn_context::isMultimodalSupportVision() const {
+bool llama_cap_context::isMultimodalSupportVision() const {
     return isMultimodalEnabled() && mtmd_wrapper->supportVision();
 }
 
-bool llama_rn_context::isMultimodalSupportAudio() const {
+bool llama_cap_context::isMultimodalSupportAudio() const {
     return isMultimodalEnabled() && mtmd_wrapper->supportAudio();
 }
 
-void llama_rn_context::releaseMultimodal() {
+void llama_cap_context::releaseMultimodal() {
     if (mtmd_wrapper != nullptr) {
         delete mtmd_wrapper;
         mtmd_wrapper = nullptr;
@@ -319,9 +320,9 @@ void llama_rn_context::releaseMultimodal() {
     }
 }
 
-bool llama_rn_context::initVocoder(const std::string &vocoder_model_path, int batch_size) {
+bool llama_cap_context::initVocoder(const std::string &vocoder_model_path, int batch_size) {
     try {
-        tts_wrapper = new llama_rn_context_tts(vocoder_model_path, batch_size);
+        tts_wrapper = new llama_cap_context_tts(vocoder_model_path, batch_size);
         has_vocoder = true;
         return true;
     } catch (const std::exception& e) {
@@ -330,11 +331,11 @@ bool llama_rn_context::initVocoder(const std::string &vocoder_model_path, int ba
     }
 }
 
-bool llama_rn_context::isVocoderEnabled() const {
+bool llama_cap_context::isVocoderEnabled() const {
     return has_vocoder && tts_wrapper != nullptr;
 }
 
-void llama_rn_context::releaseVocoder() {
+void llama_cap_context::releaseVocoder() {
     if (tts_wrapper != nullptr) {
         delete tts_wrapper;
         tts_wrapper = nullptr;
@@ -342,4 +343,57 @@ void llama_rn_context::releaseVocoder() {
     has_vocoder = false;
 }
 
+// Speculative decoding methods
+bool llama_cap_context::loadDraftModel(const std::string &draft_model_path) {
+    if (draft_model_path.empty()) {
+        return false;
+    }
+
+    // Create draft model parameters (based on main model params)
+    common_params draft_params = params;
+    draft_params.model.path = draft_model_path;
+
+    // Mobile optimization: smaller context for draft model
+    if (mobile_speculative) {
+        draft_params.n_ctx = std::min(params.n_ctx, 1024); // Limit draft context
+        draft_params.n_batch = std::min(params.n_batch, 128); // Smaller batch
+    }
+
+    try {
+        // For now, use simplified draft model initialization
+        // This would be expanded in a full implementation to properly initialize
+        // the draft model and context
+
+        // TODO: Implement proper draft model loading
+        // draft_model = llama_load_model_from_file(draft_model_path.c_str(), draft_params);
+        // draft_ctx = llama_new_context_with_model(draft_model, draft_params);
+
+        // For this implementation, we'll disable speculative decoding
+        // until proper model loading is implemented
+        printf("Draft model loading not yet implemented - falling back to regular decoding\n");
+        speculative_enabled = false;
+        return false;
+
+    } catch (const std::exception& e) {
+        printf("Failed to load draft model: %s\n", e.what());
+        releaseDraftModel();
+    }
+
+    return false;
+}
+
+void llama_cap_context::releaseDraftModel() {
+    if (draft_ctx) {
+        // Note: draft_ctx and draft_model are managed by common_init_result
+        // They will be automatically cleaned up
+        draft_ctx = nullptr;
+        draft_model = nullptr;
+    }
+    speculative_enabled = false;
+}
+
+bool llama_cap_context::isSpectulativeEnabled() const {
+    return speculative_enabled && draft_model && draft_ctx;
+}
+
 }
```