npm - react-native-litert-lm - Versions diffs - 0.1.0 → 0.2.0 - Mend

react-native-litert-lm 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

package/README.md CHANGED

@@ -9,12 +9,13 @@ High-performance LLM inference for React Native powered by [LiteRT-LM](https://g
 - ⚡ **GPU Acceleration** - GPU delegate (Android), Metal (iOS when available)
 - 📦 **Bundled Tokenizer** - No separate tokenization library needed
 - 🔄 **Streaming Support** - Token-by-token generation callbacks
-- 📱 **Cross-Platform** - Android API 26+ (iOS coming soon)
-- 🚧 **Multimodal** - Image and audio input (Coming Soon to Android)
+- 📱 **Cross-Platform** - Android API 26+
+- 🖼️ **Multimodal** - Image and audio input support (Android Beta, iOS coming soon)
+- 🧵 **Async API** - Non-blocking inference to prevent UI freezes
 ## Status
-> ⚠️ **Early Preview**: This library is under active development. Android is functional with enough RAM, iOS implementation pending LiteRT-LM iOS release. Please report any issues on the [GitHub repository](https://github.com/litert-community/react-native-litert-lm).
+> ⚠️ **Early Preview**: This library is under active development. Android is functional with enough RAM, iOS implementation pending LiteRT-LM iOS release. Please report any issues on the [GitHub issues](https://github.com/hung-yueh/react-native-litert-lm/issues).
 ## Installation
@@ -86,15 +87,15 @@ import { createLLM } from "react-native-litert-lm";
 const llm = createLLM();
-// Load a Gemma 3n model
-llm.loadModel("/path/to/gemma-3n-e2b.litertlm", {
+// Load a Gemma 3n model (async)
+await llm.loadModel("/path/to/gemma-3n-e2b.litertlm", {
   backend: "gpu",
   temperature: 0.7,
   maxTokens: 512,
 });
-// Generate response
-const response = llm.sendMessage("What is the capital of France?");
+// Generate response (async)
+const response = await llm.sendMessage("What is the capital of France?");
 console.log(response);
 // Clean up
@@ -113,14 +114,15 @@ llm.sendMessageAsync("Tell me a story", (token, done) => {
 ### Multimodal (Image/Audio)
 ```typescript
-// Image input (for vision models)
-const response = llm.sendMessageWithImage(
+// Image input (for vision models like Gemma 3n)
+// ⚠️ Ensure model is loaded with { maxTokens: 1024+ }
+const response = await llm.sendMessageWithImage(
   "What's in this image?",
   "/path/to/image.jpg",
 );
 // Audio input (for audio models)
-const transcription = llm.sendMessageWithAudio(
+const transcription = await llm.sendMessageWithAudio(
   "Transcribe this audio",
   "/path/to/audio.wav",
 );
@@ -152,7 +154,7 @@ Download `.litertlm` models from [HuggingFace](https://huggingface.co/litert-com
 Creates a new LLM inference engine instance.
-### `loadModel(path, config?)`
+### `loadModel(path, config?): Promise<void>`
 - `path: string` - Absolute path to `.litertlm` file
 - `config.backend` - `'cpu'` | `'gpu'` | `'npu'` (default: `'gpu'`)
@@ -172,19 +174,19 @@ Creates a new LLM inference engine instance.
 > ⚠️ **NPU Note**: NPU acceleration requires compatible hardware (Qualcomm Hexagon, MediaTek APU, etc.). If unavailable, LiteRT-LM automatically falls back to GPU.
-### `sendMessage(message): string`
+### `sendMessage(message): Promise<string>`
-Blocking generation. Returns complete response.
+Blocking generation (executed on background thread). Returns complete response.
 ### `sendMessageAsync(message, callback)`
 Streaming generation. Callback receives `(token, isDone)`.
-### `sendMessageWithImage(message, imagePath): string`
+### `sendMessageWithImage(message, imagePath): Promise<string>`
 Send a message with an image attachment (for vision models).
-### `sendMessageWithAudio(message, audioPath): string`
+### `sendMessageWithAudio(message, audioPath): Promise<string>`
 Send a message with an audio attachment (for audio models).

package/android/src/main/java/com/margelo/nitro/dev/litert/litertlm/HybridLiteRTLM.kt CHANGED

@@ -19,8 +19,12 @@ import com.margelo.nitro.dev.litert.litertlm.HybridLiteRTLMSpec
 import com.margelo.nitro.dev.litert.litertlm.LLMConfig
 import com.margelo.nitro.dev.litert.litertlm.Message
 import com.margelo.nitro.dev.litert.litertlm.Role
+import com.margelo.nitro.core.Promise
+import com.google.ai.edge.litertlm.Content
 // Alias to avoid confusion with our generated Message type
+// Alias to avoid confusion
 typealias LiteRTMessage = com.google.ai.edge.litertlm.Message
 /**
@@ -35,6 +39,10 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
         private const val TAG = "HybridLiteRTLM"
     }
+    init {
+        LiteRTLMRegistry.register(this)
+    }
     // LiteRT-LM Engine and Conversation
     private var engine: Engine? = null
     private var conversation: Conversation? = null
@@ -60,116 +68,124 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
     private var maxTokens: Int = 1024
     override val memorySize: Long
-        get() = 10L * 1024L * 1024L // ~10MB estimate
+        get() = 1024L * 1024L * 1024L // ~1GB (models are large)
     // -------------------------------------------------------------------------
     // loadModel - Initialize LiteRT-LM Engine and Conversation
     // -------------------------------------------------------------------------
-    override fun loadModel(modelPath: String, config: LLMConfig?) {
-        Log.i(TAG, "loadModel: $modelPath")
-        // Clean up existing resources
-        close()
-        // Apply configuration
-        config?.let { cfg ->
-            cfg.backend?.let { backend = it }
-            cfg.temperature?.let { temperature = it }
-            cfg.topK?.let { topK = it.toInt() }
-            cfg.topP?.let { topP = it }
-            cfg.maxTokens?.let { maxTokens = it.toInt() }
-        }
+    override fun loadModel(modelPath: String, config: LLMConfig?): Promise<Unit> {
+        return Promise.parallel {
+            Log.i(TAG, "loadModel: $modelPath")
+            // Clean up existing resources
+            close()
+            // Apply configuration
+            config?.let { cfg ->
+                cfg.backend?.let { backend = it }
+                cfg.temperature?.let { temperature = it }
+                cfg.topK?.let { topK = it.toInt() }
+                cfg.topP?.let { topP = it }
+                cfg.maxTokens?.let { maxTokens = it.toInt() }
+            }
-        try {
-            // Map our Backend enum to LiteRT-LM Backend enum
-            val lmBackend = when (backend) {
-                Backend.GPU -> com.google.ai.edge.litertlm.Backend.GPU
-                Backend.NPU -> {
-                    Log.i(TAG, "NPU backend requested - requires hardware support")
-                    com.google.ai.edge.litertlm.Backend.NPU
+            try {
+                // Map our Backend enum to LiteRT-LM Backend enum
+                val lmBackend = when (backend) {
+                    Backend.GPU -> com.google.ai.edge.litertlm.Backend.GPU
+                    Backend.NPU -> {
+                        Log.i(TAG, "NPU backend requested - requires hardware support")
+                        com.google.ai.edge.litertlm.Backend.NPU
+                    }
+                    else -> com.google.ai.edge.litertlm.Backend.CPU
                 }
-                else -> com.google.ai.edge.litertlm.Backend.CPU
-            }
-            // Vision backend: hardcoded to GPU (required by Gemma 3n)
-            val lmVisionBackend = com.google.ai.edge.litertlm.Backend.GPU
-            // Audio backend: hardcoded to CPU (optimal for audio processing)
-            val lmAudioBackend = com.google.ai.edge.litertlm.Backend.CPU
-            Log.i(TAG, "Backend config: main=$lmBackend, vision=$lmVisionBackend (hardcoded), audio=$lmAudioBackend (hardcoded)")
-            // Get cache directory from application context
-            // LiteRT-LM needs this to store temporary compiled model files
-            val cacheDirectory = LiteRTLMInitProvider.applicationContext?.cacheDir?.absolutePath
-            Log.i(TAG, "Using cache directory: $cacheDirectory")
-            // Create Engine configuration
-            val engineConfig = EngineConfig(
-                modelPath = modelPath,
-                backend = lmBackend,
-                visionBackend = lmVisionBackend,
-                audioBackend = lmAudioBackend,
-                maxNumTokens = maxTokens,
-                cacheDir = cacheDirectory
-            )
-            // Create Engine (heavyweight - loads model)
-            engine = Engine(engineConfig).also { it.initialize() }
-            Log.i(TAG, "Engine created and initialized successfully")
-            // Create Conversation (lightweight - holds KV cache)
-            createNewConversation()
-            Log.i(TAG, "Conversation created successfully")
-        } catch (e: Exception) {
-            Log.e(TAG, "Failed to load model: ${e.message}", e)
-            throw RuntimeException("Failed to load model: ${e.message}", e)
+                // Vision backend: hardcoded to GPU (required by Gemma 3n)
+                val lmVisionBackend = com.google.ai.edge.litertlm.Backend.GPU
+                // Audio backend: hardcoded to CPU (optimal for audio processing)
+                val lmAudioBackend = com.google.ai.edge.litertlm.Backend.CPU
+                Log.i(TAG, "Backend config: main=$lmBackend, vision=$lmVisionBackend (hardcoded), audio=$lmAudioBackend (hardcoded)")
+                // Get cache directory from application context
+                val cacheDirectory = LiteRTLMInitProvider.applicationContext?.cacheDir?.absolutePath
+                Log.i(TAG, "Using cache directory: $cacheDirectory")
+                // Create Engine configuration
+                val engineConfig = EngineConfig(
+                    modelPath = modelPath,
+                    backend = lmBackend,
+                    visionBackend = lmVisionBackend,
+                    audioBackend = lmAudioBackend,
+                    maxNumTokens = maxTokens,
+                    cacheDir = cacheDirectory
+                )
+                // Initialize Engine
+                engine = Engine(engineConfig).also { it.initialize() }
+                Log.i(TAG, "Engine created and initialized successfully")
+                // Create Conversation
+                createNewConversation()
+                Log.i(TAG, "Conversation created successfully")
+            } catch (e: Exception) {
+                Log.e(TAG, "Failed to load model: ${e.message}", e)
+                throw RuntimeException("Failed to load model: ${e.message}", e)
+            }
         }
     }
     // -------------------------------------------------------------------------
-    // sendMessage - Blocking text inference
+    // sendMessage - Helper for one-shot generation (internally uses Async)
     // -------------------------------------------------------------------------
-    override fun sendMessage(message: String): String {
-        ensureLoaded()
-        // Add user message to history
-        history.add(Message(Role.USER, message))
-        // Pre-process message (chat template)
-        Log.i(TAG, "sendMessage: $message")
-        // Blocking inference
-        // LiteRT-LM expects a Message object, not String
-        val userMsg = LiteRTMessage.of(message)
-        val responseMsg = conversation!!.sendMessage(userMsg)
-        // Extract text from response Message
-        val response = responseMsg.contents
-            .filterIsInstance<com.google.ai.edge.litertlm.Content.Text>()
-            .joinToString("") { it.text }
-        // Add model response to history
-        history.add(Message(Role.MODEL, response))
-        // Update stats (mock/approximate for now as SDK doesn't return full stats for sync call)
-        lastStats = GenerationStats(
-            promptTokens = message.length / 4.0,
-            completionTokens = response.length / 4.0,
-            totalTokens = (message.length + response.length) / 4.0,
-            timeToFirstToken = 0.0,
-            totalTime = 0.0,
-            tokensPerSecond = 0.0
-        )
-        return response
+    override fun sendMessage(message: String): Promise<String> {
+        // Implement Promise-based sendMessage using suspend coroutine logic wrapped in Promise
+        // Since Promise.parallel expects a blocking block returning T,
+        // and sendMessageAsync is callback-based, we need to bridge them.
+        // HOWEVER, we can just use the synchronous `sendMessage` API of the SDK
+        // inside the `Promise.parallel` block, which moves it off the main thread!
+        return Promise.parallel {
+            ensureLoaded()
+            // Add user message to history
+            history.add(Message(Role.USER, message))
+            Log.i(TAG, "sendMessage (Promise): $message")
+            // Blocking inference (safe here because we are in Promise.parallel worker thread)
+            val userMsg = LiteRTMessage.of(message)
+            val responseMsg = conversation!!.sendMessage(userMsg)
+            // Extract text
+            val response = responseMsg.contents
+                .filterIsInstance<com.google.ai.edge.litertlm.Content.Text>()
+                .joinToString("") { it.text }
+            // Add model response to history
+            history.add(Message(Role.MODEL, response))
+            // Update stats
+            lastStats = GenerationStats(
+                promptTokens = message.length / 4.0,
+                completionTokens = response.length / 4.0,
+                totalTokens = (message.length + response.length) / 4.0,
+                timeToFirstToken = 0.0,
+                totalTime = 0.0,
+                tokensPerSecond = 0.0
+            )
+            response // Return the string
+        }
     }
     // -------------------------------------------------------------------------
     // sendMessageAsync - Streaming inference
     // -------------------------------------------------------------------------
     override fun sendMessageAsync(message: String, onToken: (String, Boolean) -> Unit) {
+        // This is already async (void return), so we execute immediately on the calling thread
+        // (which is the Nitro specialized thread, not Main).
+        // The SDK's sendMessageAsync is non-blocking anyway.
         ensureLoaded()
         // Add user message to history
@@ -206,12 +222,8 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
         }
         try {
-            // Construct Message object
             val userMsg = LiteRTMessage.of(message)
-            // LiteRT-LM async call - SDK handles threading
             conversation!!.sendMessageAsync(userMsg, listener)
         } catch (e: Exception) {
             Log.e(TAG, "Failed into initiate async generation", e)
             onToken("Error: ${e.message}", true)
@@ -221,14 +233,63 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
     // -------------------------------------------------------------------------
     // Multimodal methods
     // -------------------------------------------------------------------------
-    override fun sendMessageWithImage(message: String, imagePath: String): String {
-        // TODO: Implement image loading from path
-        throw RuntimeException("Multimodal (Image) not yet implemented in this wrapper")
+    override fun sendMessageWithImage(message: String, imagePath: String): Promise<String> {
+        return Promise.parallel {
+            ensureLoaded()
+            Log.i(TAG, "sendMessageWithImage: $message, path=$imagePath")
+            // Create multimodal message
+            // Use factory method Message.of passing a list of Content
+            val textContent = Content.Text(message)
+            val contentList = listOf(
+                textContent,
+                Content.ImageFile(imagePath)
+            )
+            val userMsg = LiteRTMessage.of(contentList)
+            // Add to history
+            history.add(Message(Role.USER, "$message [Image]"))
+            val responseMsg = conversation!!.sendMessage(userMsg)
+            val response = responseMsg.contents
+                .filterIsInstance<Content.Text>()
+                .joinToString("") { it.text }
+            history.add(Message(Role.MODEL, response))
+            response
+        }
     }
-    override fun sendMessageWithAudio(message: String, audioPath: String): String {
-        // TODO: Implement audio loading from path
-        throw RuntimeException("Multimodal (Audio) not yet implemented in this wrapper")
+    override fun sendMessageWithAudio(message: String, audioPath: String): Promise<String> {
+        return Promise.parallel {
+            ensureLoaded()
+            Log.i(TAG, "sendMessageWithAudio: $message, path=$audioPath")
+            // Load audio
+            val contentList = listOf(
+                Content.Text(message),
+                Content.AudioFile(audioPath)
+            )
+            val userMsg = LiteRTMessage.of(contentList)
+            history.add(Message(Role.USER, "$message [Audio]"))
+            val responseMsg = conversation!!.sendMessage(userMsg)
+            val response = responseMsg.contents
+                .filterIsInstance<Content.Text>()
+                .joinToString("") { it.text }
+            history.add(Message(Role.MODEL, response))
+            response
+        }
     }
     // -------------------------------------------------------------------------
@@ -277,4 +338,6 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
         // Dispose old conversation if needed
         conversation = engine!!.createConversation()
     }
 }

package/android/src/main/java/com/margelo/nitro/dev/litert/litertlm/LiteRTLMRegistry.kt ADDED

@@ -0,0 +1,32 @@
+package com.margelo.nitro.dev.litert.litertlm
+import java.util.Collections
+import java.util.WeakHashMap
+import android.util.Log
+/**
+ * Global registry to track active LiteRTLM instances.
+ * Used for memory trimming and cleanup.
+ */
+object LiteRTLMRegistry {
+    private const val TAG = "LiteRTLMRegistry"
+    // Use WeakSet-like structure to prevent leaks
+    private val instances = Collections.newSetFromMap(WeakHashMap<HybridLiteRTLM, Boolean>())
+    fun register(instance: HybridLiteRTLM) {
+        synchronized(instances) {
+            instances.add(instance)
+        }
+    }
+    fun onTrimMemory(level: Int) {
+        Log.w(TAG, "Received memory warning (level=$level). Releasing resources...")
+        synchronized(instances) {
+            instances.forEach { it.close() }
+            // Note: We don't clear the set here, as close() should be idempotent
+            // and the instance might still be ref-counted by JS.
+            // We just ensure the HEAVY native resources are gone.
+        }
+    }
+}

package/android/src/main/java/dev/litert/litertlm/LiteRTLMInitProvider.kt CHANGED

@@ -17,6 +17,20 @@ class LiteRTLMInitProvider : ContentProvider() {
     override fun onCreate(): Boolean {
         applicationContext = context?.applicationContext
         Log.i(TAG, "LiteRTLMInitProvider initialized with context: $applicationContext")
+        applicationContext?.registerComponentCallbacks(object : android.content.ComponentCallbacks2 {
+            override fun onTrimMemory(level: Int) {
+                if (level >= android.content.ComponentCallbacks2.TRIM_MEMORY_RUNNING_LOW) {
+                    com.margelo.nitro.dev.litert.litertlm.LiteRTLMRegistry.onTrimMemory(level)
+                }
+            }
+            override fun onConfigurationChanged(newConfig: android.content.res.Configuration) {}
+            override fun onLowMemory() {
+                com.margelo.nitro.dev.litert.litertlm.LiteRTLMRegistry.onTrimMemory(android.content.ComponentCallbacks2.TRIM_MEMORY_COMPLETE)
+            }
+        })
         return true
     }

package/cpp/HybridLiteRTLM.cpp CHANGED

@@ -11,9 +11,13 @@
 #include "HybridLiteRTLM.hpp"
+#define STB_IMAGE_IMPLEMENTATION
+#include "include/stb_image.h"
 #include <chrono>
 #include <stdexcept>
 #include <sstream>
+#include <fstream>
 namespace margelo::nitro::litertlm {
@@ -229,32 +233,46 @@ std::string HybridLiteRTLM::sendMessageWithImage(
   ensureLoaded();
 #ifdef LITERT_LM_ENABLED
-  // TODO: Load image file into raw pixel buffer
-  // The Engine expects raw RGBA/RGB data, not a file path.
-  // Implementation should:
-  // 1. Read image file (using stb_image.h or Android Bitmap JNI)
-  // 2. Decode to raw pixel buffer (std::vector<uint8_t>)
-  // 3. Create litert::lm::ImageData or equivalent tensor
-  // 4. Pass to conversation_->SendMessage with multimodal content
-  // For now, fall back to text-only with a note about the image
-  std::string augmentedMessage = message + " [Image attached: " + imagePath +
-    " - Note: Image processing not yet implemented, text-only response]";
+  // Load image using stb_image
+  int width, height, channels;
+  unsigned char* img = stbi_load(imagePath.c_str(), &width, &height, &channels, 3); // Force 3 channels (RGB)
+  if (img == nullptr) {
+    throw std::runtime_error("Failed to load image from path: " + imagePath);
+  }
+  // Create input tensor/buffer for the engine.
+  // Note: The exact API for passing image data depends on the LiteRT-LM version.
+  // Assuming a structure that accepts raw bytes and dimensions.
   litert::lm::UserMessage lm_message;
   lm_message.role = "user";
-  lm_message.content = augmentedMessage;
+  // Construct multimodal content
+  // Option A: If UserMessage supports a list of content parts
+  litert::lm::ContentPart textPart;
+  textPart.type = litert::lm::ContentType::TEXT;
+  textPart.text = message;
+  lm_message.parts.push_back(textPart);
+  litert::lm::ContentPart imagePart;
+  imagePart.type = litert::lm::ContentType::IMAGE;
+  imagePart.image.width = width;
+  imagePart.image.height = height;
+  imagePart.image.channels = channels;
+  imagePart.image.data = std::vector<uint8_t>(img, img + (width * height * channels));
+  lm_message.parts.push_back(imagePart);
+  stbi_image_free(img);
   auto response = conversation_->SendMessage(lm_message);
   if (!response.ok()) {
     throw std::runtime_error("Multimodal inference failed: " +
         std::string(response.status().message()));
   }
-  // Add to history
+  // Add to history (metadata only)
   Message userMessage;
   userMessage.role = Role::USER;
-  userMessage.content = message + " [with image]";
+  userMessage.content = message + " [Image]";
   history_.push_back(userMessage);
   Message modelMessage;
@@ -266,6 +284,11 @@ std::string HybridLiteRTLM::sendMessageWithImage(
 #else
   // Stub: just process text with image path noted
+  // Verify file exists at least
+  std::ifstream f(imagePath.c_str());
+  if (!f.good()) {
+     // Don't crash, just log/stub
+  }
   return sendMessage(message + " [Image: " + imagePath + "]");
 #endif
 }
@@ -281,31 +304,41 @@ std::string HybridLiteRTLM::sendMessageWithAudio(
   ensureLoaded();
 #ifdef LITERT_LM_ENABLED
-  // TODO: Load audio file into raw sample buffer
-  // Similar to image - Engine expects raw audio samples, not file path.
-  // Implementation should:
-  // 1. Read WAV file header and samples
-  // 2. Convert to expected format (likely 16kHz mono float32)
-  // 3. Create litert::lm::AudioData or equivalent
-  // 4. Pass to conversation with multimodal content
+  // Load audio file
+  std::ifstream audioFile(audioPath, std::ios::binary);
+  if (!audioFile) {
+      throw std::runtime_error("Failed to open audio file: " + audioPath);
+  }
-  std::string augmentedMessage = message + " [Audio attached: " + audioPath +
-    " - Note: Audio processing not yet implemented, text-only response]";
+  // Simple WAV header skip (simplistic, assuming standard header size for now or raw)
+  // Ideally use a WAV parsing library or miniaudio if available.
+  // For this implementation, we read the whole file.
+  std::vector<uint8_t> audioData((std::istreambuf_iterator<char>(audioFile)), std::istreambuf_iterator<char>());
   litert::lm::UserMessage lm_message;
   lm_message.role = "user";
-  lm_message.content = augmentedMessage;
+  litert::lm::ContentPart textPart;
+  textPart.type = litert::lm::ContentType::TEXT;
+  textPart.text = message;
+  lm_message.parts.push_back(textPart);
+  litert::lm::ContentPart audioPart;
+  audioPart.type = litert::lm::ContentType::AUDIO;
+  audioPart.audio.data = audioData;
+  // Metadata like sample rate might be needed:
+  // audioPart.audio.sample_rate = 16000;
+  lm_message.parts.push_back(audioPart);
   auto response = conversation_->SendMessage(lm_message);
   if (!response.ok()) {
     throw std::runtime_error("Audio inference failed: " +
         std::string(response.status().message()));
   }
-  // Add to history
   Message userMessage;
   userMessage.role = Role::USER;
-  userMessage.content = message + " [with audio]";
+  userMessage.content = message + " [Audio]";
   history_.push_back(userMessage);
   Message modelMessage;