npm - llama-cpp-capacitor - Versions diffs - 0.0.3 → 0.0.4 - Mend

llama-cpp-capacitor 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/android/src/main/java/ai/annadata/plugin/capacitor/LlamaCpp.java +160 -126
package/android/src/main/jni.cpp +91 -9
package/package.json +1 -1

package/android/src/main/java/ai/annadata/plugin/capacitor/LlamaCpp.java CHANGED Viewed

@@ -51,6 +51,7 @@ class LlamaContext {
     private LlamaModel model;
     private boolean isMultimodalEnabled = false;
     private boolean isVocoderEnabled = false;
+    private long nativeContextId = -1;
     public LlamaContext(int id) {
         this.id = id;
@@ -83,6 +84,14 @@ class LlamaContext {
     public void setVocoderEnabled(boolean vocoderEnabled) {
         isVocoderEnabled = vocoderEnabled;
     }
+    public long getNativeContextId() {
+        return nativeContextId;
+    }
+    public void setNativeContextId(long nativeContextId) {
+        this.nativeContextId = nativeContextId;
+    }
 }
 class LlamaModel {
@@ -231,16 +240,33 @@ public class LlamaCpp {
     private int contextLimit = 10;
     private boolean nativeLogEnabled = false;
+    // Native method declarations
+    private native long initContextNative(String modelPath, JSObject params);
+    private native void releaseContextNative(long nativeContextId);
+    private native String completionNative(long contextId, String prompt);
+    private native void stopCompletionNative(long contextId);
+    private native String getFormattedChatNative(long contextId, String messages, String chatTemplate);
+    private native boolean toggleNativeLogNative(boolean enabled);
+    static {
+        System.loadLibrary("llama-cpp");
+    }
     // MARK: - Core initialization and management
     public void toggleNativeLog(boolean enabled, LlamaCallback<Void> callback) {
-        nativeLogEnabled = enabled;
-        if (enabled) {
-            Log.i(TAG, "Native logging enabled");
-        } else {
-            Log.i(TAG, "Native logging disabled");
+        try {
+            boolean result = toggleNativeLogNative(enabled);
+            nativeLogEnabled = enabled;
+            if (enabled) {
+                Log.i(TAG, "Native logging enabled");
+            } else {
+                Log.i(TAG, "Native logging disabled");
+            }
+            callback.onResult(LlamaResult.success(null));
+        } catch (Exception e) {
+            callback.onResult(LlamaResult.failure(new LlamaError("Failed to toggle native log: " + e.getMessage())));
         }
-        callback.onResult(LlamaResult.success(null));
     }
     public void setContextLimit(int limit, LlamaCallback<Void> callback) {
@@ -268,89 +294,70 @@ public class LlamaCpp {
             return;
         }
-        // Extract parameters
-        String modelPath = params.getString("model");
-        if (modelPath == null) {
-            callback.onResult(LlamaResult.failure(new LlamaError("Invalid parameters")));
-            return;
+        try {
+            // Extract parameters
+            String modelPath = params.getString("model");
+            if (modelPath == null || modelPath.isEmpty()) {
+                callback.onResult(LlamaResult.failure(new LlamaError("Model path is required")));
+                return;
+            }
+            // Call native initialization
+            long nativeContextId = initContextNative(modelPath, params);
+            if (nativeContextId < 0) {
+                callback.onResult(LlamaResult.failure(new LlamaError("Failed to initialize native context")));
+                return;
+            }
+            // Create Java context wrapper
+            LlamaContext context = new LlamaContext(contextId);
+            context.setNativeContextId(nativeContextId);
+            contexts.put(contextId, context);
+            // Return context info
+            Map<String, Object> contextInfo = new HashMap<>();
+            contextInfo.put("contextId", contextId);
+            contextInfo.put("gpu", false);
+            contextInfo.put("reasonNoGPU", "Currently not supported");
+            Map<String, Object> modelInfo = new HashMap<>();
+            modelInfo.put("desc", "Loaded model");
+            modelInfo.put("size", 0);
+            modelInfo.put("nEmbd", 0);
+            modelInfo.put("nParams", 0);
+            modelInfo.put("path", modelPath);
+            contextInfo.put("model", modelInfo);
+            contextInfo.put("androidLib", "llama-cpp");
+            callback.onResult(LlamaResult.success(contextInfo));
+        } catch (Exception e) {
+            callback.onResult(LlamaResult.failure(new LlamaError("Context initialization failed: " + e.getMessage())));
         }
-        // Create context
-        LlamaContext context = new LlamaContext(contextId);
-        // Create model info (this would typically load from GGUF file)
-        MinjaCaps defaultCaps = new MinjaCaps(true, true, true, true, true, true);
-        MinjaCaps toolUseCaps = new MinjaCaps(true, true, true, true, true, true);
-        MinjaTemplates minja = new MinjaTemplates(true, defaultCaps, true, toolUseCaps);
-        ChatTemplates chatTemplates = new ChatTemplates(true, minja);
-        LlamaModel model = new LlamaModel(
-            modelPath,
-            "Sample model",
-            0,
-            0,
-            0,
-            chatTemplates,
-            new HashMap<>()
-        );
-        context.setModel(model);
-        contexts.put(contextId, context);
-        // Return context info
-        Map<String, Object> contextInfo = new HashMap<>();
-        contextInfo.put("contextId", contextId);
-        contextInfo.put("gpu", false);
-        contextInfo.put("reasonNoGPU", "Not implemented");
-        Map<String, Object> modelInfo = new HashMap<>();
-        modelInfo.put("desc", model.getDesc());
-        modelInfo.put("size", model.getSize());
-        modelInfo.put("nEmbd", model.getNEmbd());
-        modelInfo.put("nParams", model.getNParams());
-        Map<String, Object> chatTemplatesInfo = new HashMap<>();
-        chatTemplatesInfo.put("llamaChat", model.getChatTemplates().isLlamaChat());
-        Map<String, Object> minjaInfo = new HashMap<>();
-        minjaInfo.put("default", model.getChatTemplates().getMinja().isDefault());
-        Map<String, Object> defaultCapsInfo = new HashMap<>();
-        defaultCapsInfo.put("tools", model.getChatTemplates().getMinja().getDefaultCaps().isTools());
-        defaultCapsInfo.put("toolCalls", model.getChatTemplates().getMinja().getDefaultCaps().isToolCalls());
-        defaultCapsInfo.put("toolResponses", model.getChatTemplates().getMinja().getDefaultCaps().isToolResponses());
-        defaultCapsInfo.put("systemRole", model.getChatTemplates().getMinja().getDefaultCaps().isSystemRole());
-        defaultCapsInfo.put("parallelToolCalls", model.getChatTemplates().getMinja().getDefaultCaps().isParallelToolCalls());
-        defaultCapsInfo.put("toolCallId", model.getChatTemplates().getMinja().getDefaultCaps().isToolCallId());
-        Map<String, Object> toolUseCapsInfo = new HashMap<>();
-        toolUseCapsInfo.put("tools", model.getChatTemplates().getMinja().getToolUseCaps().isTools());
-        toolUseCapsInfo.put("toolCalls", model.getChatTemplates().getMinja().getToolUseCaps().isToolCalls());
-        toolUseCapsInfo.put("toolResponses", model.getChatTemplates().getMinja().getToolUseCaps().isToolResponses());
-        toolUseCapsInfo.put("systemRole", model.getChatTemplates().getMinja().getToolUseCaps().isSystemRole());
-        toolUseCapsInfo.put("parallelToolCalls", model.getChatTemplates().getMinja().getToolUseCaps().isParallelToolCalls());
-        toolUseCapsInfo.put("toolCallId", model.getChatTemplates().getMinja().getToolUseCaps().isToolCallId());
-        minjaInfo.put("defaultCaps", defaultCapsInfo);
-        minjaInfo.put("toolUse", model.getChatTemplates().getMinja().isToolUse());
-        minjaInfo.put("toolUseCaps", toolUseCapsInfo);
-        chatTemplatesInfo.put("minja", minjaInfo);
-        modelInfo.put("chatTemplates", chatTemplatesInfo);
-        modelInfo.put("metadata", model.getMetadata());
-        modelInfo.put("isChatTemplateSupported", true);
-        contextInfo.put("model", modelInfo);
-        callback.onResult(LlamaResult.success(contextInfo));
     }
     public void releaseContext(int contextId, LlamaCallback<Void> callback) {
-        if (contexts.remove(contextId) == null) {
+        LlamaContext context = contexts.get(contextId);
+        if (context == null) {
             callback.onResult(LlamaResult.failure(new LlamaError("Context not found")));
             return;
         }
-        callback.onResult(LlamaResult.success(null));
+        try {
+            // Release native context
+            if (context.getNativeContextId() >= 0) {
+                releaseContextNative(context.getNativeContextId());
+            }
+            // Remove from Java context map
+            contexts.remove(contextId);
+            callback.onResult(LlamaResult.success(null));
+        } catch (Exception e) {
+            callback.onResult(LlamaResult.failure(new LlamaError("Failed to release context: " + e.getMessage())));
+        }
     }
     public void releaseAllContexts(LlamaCallback<Void> callback) {
@@ -367,15 +374,22 @@ public class LlamaCpp {
             return;
         }
-        // This would typically format the chat using the model's chat templates
-        // For now, return a basic formatted chat
-        Map<String, Object> formattedChat = new HashMap<>();
-        formattedChat.put("type", "llama-chat");
-        formattedChat.put("prompt", messages);
-        formattedChat.put("has_media", false);
-        formattedChat.put("media_paths", new String[0]);
-        callback.onResult(LlamaResult.success(formattedChat));
+        try {
+            // Call native formatted chat
+            String result = getFormattedChatNative(context.getNativeContextId(), messages, chatTemplate);
+            // Build formatted chat result
+            Map<String, Object> formattedChat = new HashMap<>();
+            formattedChat.put("type", "llama-chat");
+            formattedChat.put("prompt", result);
+            formattedChat.put("has_media", false);
+            formattedChat.put("media_paths", new String[0]);
+            callback.onResult(LlamaResult.success(formattedChat));
+        } catch (Exception e) {
+            callback.onResult(LlamaResult.failure(new LlamaError("Failed to format chat: " + e.getMessage())));
+        }
     }
     public void completion(int contextId, JSObject params, LlamaCallback<Map<String, Object>> callback) {
@@ -385,48 +399,68 @@ public class LlamaCpp {
             return;
         }
-        // This would typically perform the completion using llama.cpp
-        // For now, return a basic completion result
-        Map<String, Object> completionResult = new HashMap<>();
-        completionResult.put("text", "Sample completion text");
-        completionResult.put("reasoning_content", "");
-        completionResult.put("tool_calls", new Object[0]);
-        completionResult.put("content", "Sample completion text");
-        completionResult.put("chat_format", 0);
-        completionResult.put("tokens_predicted", 0);
-        completionResult.put("tokens_evaluated", 0);
-        completionResult.put("truncated", false);
-        completionResult.put("stopped_eos", false);
-        completionResult.put("stopped_word", "");
-        completionResult.put("stopped_limit", 0);
-        completionResult.put("stopping_word", "");
-        completionResult.put("context_full", false);
-        completionResult.put("interrupted", false);
-        completionResult.put("tokens_cached", 0);
-        Map<String, Object> timings = new HashMap<>();
-        timings.put("prompt_n", 0);
-        timings.put("prompt_ms", 0);
-        timings.put("prompt_per_token_ms", 0);
-        timings.put("prompt_per_second", 0);
-        timings.put("predicted_n", 0);
-        timings.put("predicted_ms", 0);
-        timings.put("predicted_per_token_ms", 0);
-        timings.put("predicted_per_second", 0);
-        completionResult.put("timings", timings);
-        callback.onResult(LlamaResult.success(completionResult));
+        try {
+            // Extract parameters from JSObject
+            String prompt = params.getString("prompt", "");
+            int nPredict = params.getInteger("n_predict", 128);
+            float temperature = params.has("temp") ? (float) params.getDouble("temp") : 0.8f;
+            float topP = params.has("top_p") ? (float) params.getDouble("top_p") : 0.95f;
+            int topK = params.getInteger("top_k", 40);
+            float repeatPenalty = params.has("repeat_penalty") ? (float) params.getDouble("repeat_penalty") : 1.1f;
+            // Call native completion
+            String result = completionNative(context.getNativeContextId(), prompt);
+            // Build completion result
+            Map<String, Object> completionResult = new HashMap<>();
+            completionResult.put("text", result);
+            completionResult.put("reasoning_content", "");
+            completionResult.put("tool_calls", new Object[0]);
+            completionResult.put("content", result);
+            completionResult.put("chat_format", 0);
+            completionResult.put("tokens_predicted", nPredict);
+            completionResult.put("tokens_evaluated", 0);
+            completionResult.put("truncated", false);
+            completionResult.put("stopped_eos", false);
+            completionResult.put("stopped_word", "");
+            completionResult.put("stopped_limit", 0);
+            completionResult.put("stopping_word", "");
+            completionResult.put("context_full", false);
+            completionResult.put("interrupted", false);
+            completionResult.put("tokens_cached", 0);
+            Map<String, Object> timings = new HashMap<>();
+            timings.put("prompt_n", 0);
+            timings.put("prompt_ms", 0);
+            timings.put("prompt_per_token_ms", 0);
+            timings.put("prompt_per_second", 0);
+            timings.put("predicted_n", nPredict);
+            timings.put("predicted_ms", 0);
+            timings.put("predicted_per_token_ms", 0);
+            timings.put("predicted_per_second", 0);
+            completionResult.put("timings", timings);
+            callback.onResult(LlamaResult.success(completionResult));
+        } catch (Exception e) {
+            callback.onResult(LlamaResult.failure(new LlamaError("Completion failed: " + e.getMessage())));
+        }
     }
     public void stopCompletion(int contextId, LlamaCallback<Void> callback) {
-        if (contexts.get(contextId) == null) {
+        LlamaContext context = contexts.get(contextId);
+        if (context == null) {
             callback.onResult(LlamaResult.failure(new LlamaError("Context not found")));
             return;
         }
-        // This would typically stop any ongoing completion
-        callback.onResult(LlamaResult.success(null));
+        try {
+            stopCompletionNative(context.getNativeContextId());
+            callback.onResult(LlamaResult.success(null));
+        } catch (Exception e) {
+            callback.onResult(LlamaResult.failure(new LlamaError("Failed to stop completion: " + e.getMessage())));
+        }
     }
     // MARK: - Session management

package/android/src/main/jni.cpp CHANGED Viewed

@@ -123,7 +123,7 @@ jclass find_class(JNIEnv* env, const char* name) {
 }
 // Global context storage
-static std::map<jlong, std::unique_ptr<llama_rn_context>> contexts;
+static std::map<jlong, std::unique_ptr<rnllama::llama_rn_context>> contexts;
 static jlong next_context_id = 1;
 extern "C" {
@@ -136,15 +136,24 @@ Java_ai_annadata_plugin_capacitor_LlamaCpp_initContext(
         std::string model_path_str = jstring_to_string(env, model_path);
         // Create new context
-        auto context = std::make_unique<llama_rn_context>();
+        auto context = std::make_unique<rnllama::llama_rn_context>();
-        // Initialize common parameters (simplified)
+        // Initialize common parameters
         common_params cparams;
         cparams.model = model_path_str;
         cparams.n_ctx = 2048;
         cparams.n_batch = 512;
         cparams.n_threads = 4;
         cparams.n_gpu_layers = 0;
+        cparams.rope_freq_base = 10000.0f;
+        cparams.rope_freq_scale = 1.0f;
+        cparams.mul_mat_q = true;
+        cparams.f16_kv = true;
+        cparams.logits_all = false;
+        cparams.embedding = false;
+        cparams.use_mmap = true;
+        cparams.use_mlock = false;
+        cparams.numa = GGML_NUMA_STRATEGY_DISABLED;
         // Load model
         if (!context->loadModel(cparams)) {
@@ -195,8 +204,76 @@ Java_ai_annadata_plugin_capacitor_LlamaCpp_completion(
         std::string prompt_str = jstring_to_string(env, prompt);
-        // Simplified completion (placeholder implementation)
-        std::string result = "Generated text for: " + prompt_str;
+        // Get the context
+        rnllama::llama_rn_context* context = it->second.get();
+        // Initialize completion if not already done
+        if (!context->completion) {
+            context->completion = new rnllama::llama_rn_context_completion(context);
+        }
+        // Set up completion parameters
+        completion_params cparams;
+        cparams.prompt = prompt_str;
+        cparams.n_predict = 128;
+        cparams.n_keep = 0;
+        cparams.n_discard = -1;
+        cparams.n_probs = 0;
+        cparams.logit_bias.clear();
+        cparams.top_k = 40;
+        cparams.top_p = 0.95f;
+        cparams.tfs_z = 1.0f;
+        cparams.typical_p = 1.0f;
+        cparams.temp = 0.8f;
+        cparams.repeat_penalty = 1.1f;
+        cparams.repeat_last_n = 64;
+        cparams.frequency_penalty = 0.0f;
+        cparams.presence_penalty = 0.0f;
+        cparams.mirostat = 0;
+        cparams.mirostat_tau = 5.0f;
+        cparams.mirostat_eta = 0.1f;
+        cparams.penalize_nl = true;
+        cparams.grammar = "";
+        cparams.grammar_penalty.clear();
+        cparams.antiprompt.clear();
+        cparams.seed = -1;
+        cparams.ignore_eos = false;
+        cparams.stop_sequences.clear();
+        cparams.streaming = false;
+        // Perform completion
+        std::string result;
+        try {
+            // Tokenize the prompt
+            auto tokenize_result = context->tokenize(prompt_str, {});
+            // Set up completion
+            context->completion->rewind();
+            context->completion->beginCompletion();
+            // Process tokens
+            for (size_t i = 0; i < tokenize_result.tokens.size(); i++) {
+                llama_batch_add(&context->completion->embd, tokenize_result.tokens[i], i, {0}, false);
+            }
+            // Generate completion
+            std::string generated_text;
+            for (int i = 0; i < cparams.n_predict; i++) {
+                auto token_output = context->completion->nextToken();
+                if (token_output.tok == llama_token_eos(context->ctx)) {
+                    break;
+                }
+                std::string token_text = rnllama::tokens_to_output_formatted_string(context->ctx, token_output.tok);
+                generated_text += token_text;
+            }
+            result = generated_text;
+        } catch (const std::exception& e) {
+            LOGE("Completion error: %s", e.what());
+            result = "Error during completion: " + std::string(e.what());
+        }
         LOGI("Completion for context %lld: %s", context_id, prompt_str.c_str());
         return string_to_jstring(env, result);
@@ -215,7 +292,10 @@ Java_ai_annadata_plugin_capacitor_LlamaCpp_stopCompletion(
     try {
         auto it = contexts.find(context_id);
         if (it != contexts.end()) {
-            // Stop completion logic would go here
+            rnllama::llama_rn_context* context = it->second.get();
+            if (context->completion) {
+                context->completion->is_interrupted = true;
+            }
             LOGI("Stopped completion for context %lld", context_id);
         }
     } catch (const std::exception& e) {
@@ -238,8 +318,10 @@ Java_ai_annadata_plugin_capacitor_LlamaCpp_getFormattedChat(
         std::string messages_str = jstring_to_string(env, messages);
         std::string template_str = jstring_to_string(env, chat_template);
-        // Simplified chat formatting (placeholder implementation)
-        std::string result = "Formatted chat: " + messages_str;
+        rnllama::llama_rn_context* context = it->second.get();
+        // Format chat using the context's method
+        std::string result = context->getFormattedChat(messages_str, template_str);
         LOGI("Formatted chat for context %lld", context_id);
         return string_to_jstring(env, result);
@@ -256,7 +338,7 @@ Java_ai_annadata_plugin_capacitor_LlamaCpp_toggleNativeLog(
     JNIEnv* env, jobject thiz, jboolean enabled) {
     try {
-        rnllama_verbose = jboolean_to_bool(enabled);
+        rnllama::rnllama_verbose = jboolean_to_bool(enabled);
         LOGI("Native logging %s", enabled ? "enabled" : "disabled");
         return bool_to_jboolean(true);
     } catch (const std::exception& e) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "llama-cpp-capacitor",
-  "version": "0.0.3",
+  "version": "0.0.4",
   "description": "A native Capacitor plugin that embeds llama.cpp directly into mobile apps, enabling offline AI inference with comprehensive support for text generation, multimodal processing, TTS, LoRA adapters, and more.",
   "main": "dist/plugin.cjs.js",
   "module": "dist/esm/index.js",