npm - react-native-litert-lm - Versions diffs - 0.3.6 → 0.3.7 - Mend

react-native-litert-lm 0.3.6 → 0.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/README.md +74 -43
package/android/src/main/java/com/margelo/nitro/dev/litert/litertlm/HybridLiteRTLM.kt +169 -13
package/cpp/HybridLiteRTLM.cpp +125 -24
package/cpp/HybridLiteRTLM.hpp +3 -1
package/cpp/include/README.md +9 -11
package/lib/index.d.ts +2 -2
package/lib/index.js +11 -2
package/package.json +1 -1
package/src/index.ts +12 -2

package/README.md CHANGED Viewed

@@ -6,7 +6,7 @@ High-performance on-device LLM inference for React Native, powered by [LiteRT-LM
 - 🚀 **Native Performance** — Kotlin (Android) / C++ (iOS) via Nitro Modules JSI bindings
 - 🧠 **Gemma 4 Ready** — First-class support for Gemma 4 E2B/E4B multimodal models (text + vision + audio)
-- ⚡ **GPU Acceleration** — GPU delegate (Android), Metal/MPS (iOS)
+- ⚡ **GPU Acceleration** — Metal (iOS), OpenCL GPU delegate (Android, Pixel devices)
 - 🔄 **Streaming Support** — Token-by-token generation callbacks
 - 📱 **Cross-Platform** — Android API 26+ / iOS 15.0+
 - 🖼️ **Multimodal** — Image and audio input support
@@ -15,6 +15,12 @@ High-performance on-device LLM inference for React Native, powered by [LiteRT-LM
 - 🧮 **Zero-Copy Buffers** — Memory snapshots stored in native ArrayBuffers via Nitro Modules
 - 📥 **Automatic Model Download** — Downloads models from URL with progress tracking and local caching
+## Demo
+> Gemma 4 E2B running on-device on a Samsung Galaxy S22 (Snapdragon 8 Gen 1, 4 GB RAM) — CPU backend, streaming inference.
+<video src="https://github.com/user-attachments/assets/1da527ce-0432-4f8b-8899-474f81b2feea" width="300" controls></video>
 ## Installation
 ```bash
@@ -94,35 +100,38 @@ The `example/` directory contains a fully functional test app with a dark-themed
 ## Model Management
-LiteRT-LM models (like Gemma 4) are large files (2–4 GB) and cannot be bundled into your app binary. They are downloaded at runtime.
+LiteRT-LM models (like Gemma 4) are large files (1–4 GB) and cannot be bundled into your app binary. They are downloaded at runtime.
 ### Automatic Downloading
-The library handles downloading automatically when you pass a URL to `loadModel` or `useModel`. Downloads include:
+Pass an HTTPS URL to `useModel()` or `loadModel()` — the library handles the rest:
 - **Progress tracking** — real-time download percentage via callbacks
 - **Local caching** — downloaded models are cached and reused across app launches
-  - **Android**: app-local temp directory
+  - **Android**: `files/models/` (app-private)
   - **iOS**: `Library/Caches/litert_models/` (survives app relaunch; reclaimable by iOS under storage pressure)
 - **HTTPS enforcement** — only secure URLs are accepted
-### Manual Downloading (Optional)
+### Manual Downloading
-If you prefer to manage downloads yourself (e.g., using `expo-file-system`), download the `.litertlm` file to a local path and pass that path to the library:
+If you need custom control over downloads (e.g., authentication headers for private model hosting, resumable downloads, or custom caching), use your preferred HTTP client and pass the local file path:
 ```typescript
-import * as FileSystem from "expo-file-system";
-import { GEMMA_4_E2B_IT } from "react-native-litert-lm";
+import { fetch } from "expo/fetch";
+import { File, Paths } from "expo-file-system";
+import { useModel } from "react-native-litert-lm";
-const localPath = `${FileSystem.documentDirectory}gemma-4-E2B-it.litertlm`;
+const MODEL_URL = "https://example.com/private-model.litertlm";
-async function downloadModel() {
-  const info = await FileSystem.getInfoAsync(localPath);
-  if (info.exists) return localPath;
+// Download with custom headers using expo/fetch
+const response = await fetch(MODEL_URL, {
+  headers: { Authorization: `Bearer ${token}` },
+});
+const modelFile = new File(Paths.cache, "my-model.litertlm");
+modelFile.write(await response.bytes());
-  await FileSystem.downloadAsync(GEMMA_4_E2B_IT, localPath);
-  return localPath;
-}
+// Pass the local path — no download occurs
+const { model, isReady } = useModel(modelFile.uri, { backend: "cpu" });
 ```
 ## Usage
@@ -307,19 +316,19 @@ const buffer = tracker.getNativeBuffer();
 ## Supported Models
-Download `.litertlm` models automatically using the exported URL constants, or manually from [HuggingFace](https://huggingface.co/litert-community):
+All exported model URLs are **public — no authentication required**. Pass them directly to `useModel()` or `loadModel()` for automatic downloading with progress tracking and local caching.
-| Constant               | Model                           | Size    | Min RAM | Auth Required  |
-| :--------------------- | :------------------------------ | :------ | :------ | :------------- |
-| `GEMMA_4_E2B_IT`       | Gemma 4 E2B (Multimodal, IT)    | 2.58 GB | 4 GB+   | ❌ No          |
-| `GEMMA_4_E4B_IT`       | Gemma 4 E4B (Higher Quality)    | 3.65 GB | 6 GB+   | ❌ No          |
-| `GEMMA_3N_E2B_IT_INT4` | Gemma 3n E2B (Int4, Multimodal) | ~1.3 GB | 4 GB+   | ✅ HuggingFace |
+| Constant               | Model                           | Size    | Min RAM | Source      |
+| :--------------------- | :------------------------------ | :------ | :------ | :---------- |
+| `GEMMA_4_E2B_IT`       | Gemma 4 E2B (Multimodal, IT)    | 2.58 GB | 4 GB+   | HuggingFace |
+| `GEMMA_4_E4B_IT`       | Gemma 4 E4B (Higher Quality)    | 3.65 GB | 6 GB+   | HuggingFace |
+| `GEMMA_3N_E2B_IT_INT4` | Gemma 3n E2B (Int4, Multimodal) | ~1.3 GB | 4 GB+   | litert.dev  |
-> **Recommended:** Use `GEMMA_4_E2B_IT` for most use cases. It's multimodal (text + vision + audio) and downloads directly from HuggingFace without requiring an account.
+> **Recommended:** Use `GEMMA_4_E2B_IT` for most use cases — multimodal (text + vision + audio) and the best quality-to-size ratio.
 >
-> **iOS Note:** Models larger than ~2 GB (like Gemma 4) require the `com.apple.developer.kernel.extended-virtual-addressing` entitlement. See [iOS Entitlements](#ios-entitlements) below.
+> **iOS Note:** Models larger than ~2 GB require the `com.apple.developer.kernel.extended-virtual-addressing` entitlement. See [iOS Entitlements](#ios-entitlements) below. Gemma 3n E2B (~1.3 GB) works without it.
-**Other compatible models** (download manually from HuggingFace):
+**Other compatible models** (download `.litertlm` files manually from [HuggingFace](https://huggingface.co/litert-community)):
 | Model         | Size    | Min RAM | Notes                 |
 | ------------- | ------- | ------- | --------------------- |
@@ -352,13 +361,15 @@ Loads a model from a local path or HTTPS URL.
 #### Backend Options
-| Backend | Engine              | Speed   | Notes                                          |
-| ------- | ------------------- | ------- | ---------------------------------------------- |
-| `'cpu'` | CPU inference       | Slowest | Always available, lower RAM requirement        |
-| `'gpu'` | GPU / Metal         | Fast    | Recommended default                            |
-| `'npu'` | NPU / Neural Engine | Fastest | Requires supported hardware; falls back to GPU |
+| Backend | Engine                         | Speed   | Notes                                                                              |
+| ------- | ------------------------------ | ------- | ---------------------------------------------------------------------------------- |
+| `'cpu'` | CPU inference                  | Slowest | Always available on all devices                                                    |
+| `'gpu'` | Metal (iOS) / OpenCL (Android) | Fast    | iOS: always available. Android: requires OpenCL (Pixel only, not Samsung/Qualcomm) |
+| `'npu'` | NPU / Neural Engine            | Fastest | Requires supported hardware; experimental                                          |
-> **iOS**: `'cpu'` is the recommended default backend. `'gpu'` (Metal/MPS) is also supported. The engine automatically tries multiple backend combinations if the primary one fails.
+> **iOS**: Both `'cpu'` and `'gpu'` (Metal) are supported. The engine automatically tries fallback backend combinations if the primary one fails.
+>
+> **Android GPU**: The GPU backend requires OpenCL, which is **not available on most Samsung and Qualcomm devices**. Use `checkBackendSupport('gpu')` to check before loading. The engine will throw a clear error if GPU is unsupported.
 ### `sendMessage(message): Promise<string>`
@@ -383,14 +394,16 @@ Returns performance metrics from the last inference call.
 ```typescript
 interface GenerationStats {
   tokensPerSecond: number;
-  totalTime: number; // seconds
-  timeToFirstToken: number; // seconds
+  totalTime: number; // milliseconds
+  timeToFirstToken: number; // milliseconds
   promptTokens: number;
   completionTokens: number;
-  prefillSpeed: number; // tokens/sec
+  totalTokens: number;
 }
 ```
+> **Note**: Stats are available for both sync (`sendMessage`) and streaming (`sendMessageAsync`) on both platforms. iOS uses real benchmark data from the C API; Android uses heuristic token counts (~4 chars/token) with precise timing.
 ### `getMemoryUsage(): MemoryUsage`
 Returns real OS-level memory usage.
@@ -432,10 +445,21 @@ import {
   applyLlamaTemplate,
 } from "react-native-litert-lm";
-// Check if a backend is supported
-const warning = checkBackendSupport("npu"); // string | undefined
+// Check if GPU is supported on this device
+const gpuWarning = checkBackendSupport("gpu");
+if (gpuWarning) {
+  console.warn(gpuWarning);
+  // "GPU backend requires OpenCL support, which is unavailable on most Samsung and Qualcomm devices."
+}
+// Check NPU support
+const npuWarning = checkBackendSupport("npu"); // string | undefined
+// Check multimodal support
 const mmError = checkMultimodalSupport(); // string | undefined
-const backend = getRecommendedBackend(); // 'gpu' | 'cpu'
+// Get recommended backend
+const backend = getRecommendedBackend(); // 'cpu'
 // Manual prompt formatting (advanced)
 const prompt = applyGemmaTemplate(
@@ -456,10 +480,10 @@ const prompt = applyGemmaTemplate(
 ## Platform Support
-| Platform | Status   | Architecture | Backends         |
-| -------- | -------- | ------------ | ---------------- |
-| Android  | ✅ Ready | arm64-v8a    | CPU, GPU, NPU    |
-| iOS      | ✅ Ready | arm64        | CPU, GPU (Metal) |
+| Platform | Status   | Architecture | Backends                                          |
+| -------- | -------- | ------------ | ------------------------------------------------- |
+| Android  | ✅ Ready | arm64-v8a    | CPU (all devices), GPU (OpenCL devices only), NPU |
+| iOS      | ✅ Ready | arm64        | CPU, GPU (Metal — always available)               |
 ### iOS Feature Matrix
@@ -552,13 +576,20 @@ Additionally, `PromptTemplate` is patched at build time to use a simplified C++
 ├──────────────────────┬──────────────────────────┤
 │  Android (Kotlin)    │  iOS (C++)               │
 │  HybridLiteRTLM.kt   │  HybridLiteRTLM.cpp      │
-│  litertlm-android    │  LiteRTLM C API          │
+│  litertlm-android    │  LiteRT-LM C API         │
 │  AAR (GPU delegate)  │  XCFramework (Metal)     │
 └──────────────────────┴──────────────────────────┘
 ```
-- **Android**: Kotlin (`HybridLiteRTLM.kt`) interfacing with the `litertlm-android` AAR.
-- **iOS**: C++ (`HybridLiteRTLM.cpp`) interfacing with the LiteRT-LM C API via a prebuilt `LiteRTLM.xcframework`. All engine operations (load, inference, streaming) run on dedicated `pthread` threads with 8 MB stack to accommodate XNNPack's stack requirements. Platform-specific code (model downloading, file management) is in Objective-C++ (`ios/IOSDownloadHelper.mm`).
+- **Android**: Kotlin (`HybridLiteRTLM.kt`) interfacing with the `litertlm-android` AAR via the **Kotlin SDK**. The SDK handles control token stripping and turn management automatically. Engine validation probes for OpenCL availability before GPU initialization. `ConversationConfig` with `SamplerConfig` is passed for all conversations (matching the Gallery app pattern).
+- **iOS**: C++ (`HybridLiteRTLM.cpp`) interfacing with the LiteRT-LM **C API** via a prebuilt `LiteRTLM.xcframework`. Unlike the Kotlin SDK, the C API emits raw tokens including control sequences (`<end_of_turn>`, `<start_of_turn>`) and echoed user messages. The C++ layer implements a robust sanitization pipeline:
+  - **Accumulation-and-diff** — buffers the full response and emits only new deltas
+  - **`stripControlTokens()`** — removes all control sequences from the accumulated buffer
+  - **`safeEmitLength()`** — look-ahead buffering that withholds partial control tokens (e.g., `<end_of_tur`) from emission until the full token is received or the stream terminates
+  - **Echo mitigation** — strips echoed user messages from the raw stream
+  - **Final flush** — mandatory clean-and-flush step on stream termination
+  Platform-specific code (model downloading, file management) is in Objective-C++ (`ios/IOSDownloadHelper.mm`).
 > **For contributors**: Changes to `cpp/HybridLiteRTLM.cpp` do not affect Android. Feature changes must be applied to both the Kotlin and C++ implementations.

package/android/src/main/java/com/margelo/nitro/dev/litert/litertlm/HybridLiteRTLM.kt CHANGED Viewed

@@ -17,6 +17,7 @@ import com.google.ai.edge.litertlm.Engine
 import com.google.ai.edge.litertlm.Conversation
 import com.google.ai.edge.litertlm.EngineConfig
 import com.google.ai.edge.litertlm.ConversationConfig
+import com.google.ai.edge.litertlm.SamplerConfig
 import com.margelo.nitro.dev.litert.litertlm.Backend
 import com.margelo.nitro.dev.litert.litertlm.GenerationStats
 import com.margelo.nitro.dev.litert.litertlm.HybridLiteRTLMSpec
@@ -25,6 +26,11 @@ import com.margelo.nitro.dev.litert.litertlm.Message
 import com.margelo.nitro.dev.litert.litertlm.Role
 import com.margelo.nitro.core.Promise
 import com.google.ai.edge.litertlm.Content
+import com.google.ai.edge.litertlm.Contents
+import java.util.concurrent.CountDownLatch
+import java.util.concurrent.TimeUnit
+import java.util.concurrent.atomic.AtomicBoolean
+import java.util.concurrent.atomic.AtomicReference
 // Alias to avoid confusion with our generated Message type
@@ -42,13 +48,26 @@ internal class StreamingCallbackListener(
     private val onToken: (String, Boolean) -> Unit,
     private val responseBuilder: StringBuilder,
     private val history: MutableList<Message>,
+    private val userMessage: String,
+    private val onStatsReady: (GenerationStats) -> Unit,
 ) : com.google.ai.edge.litertlm.MessageCallback {
-    override fun onMessage(responseMsg: com.google.ai.edge.litertlm.Message) {
-        val chunk = responseMsg.contents.contents
+    private val startTime = System.nanoTime()
+    private var firstTokenTime = 0L
+    private var tokenCount = 0
+    override fun onMessage(message: com.google.ai.edge.litertlm.Message) {
+        val chunk = message.contents.contents
             .filterIsInstance<com.google.ai.edge.litertlm.Content.Text>()
             .joinToString("") { it.text }
+        if (firstTokenTime == 0L && chunk.isNotEmpty()) {
+            firstTokenTime = System.nanoTime()
+        }
+        if (chunk.isNotEmpty()) {
+            tokenCount++
+        }
         onToken(chunk, false)
         if (chunk.isNotEmpty()) {
@@ -60,12 +79,27 @@ internal class StreamingCallbackListener(
         onToken("", true)
         val fullResponse = responseBuilder.toString()
         history.add(Message(Role.MODEL, fullResponse))
-        Log.d("StreamingCallbackListener", "Streaming done. Length: ${fullResponse.length}")
+        // Compute stats using heuristic token counts (~4 chars/token)
+        val elapsedMs = (System.nanoTime() - startTime) / 1_000_000.0
+        val ttftMs = if (firstTokenTime > 0) (firstTokenTime - startTime) / 1_000_000.0 else 0.0
+        val promptTokens = userMessage.length / 4.0
+        val completionTokens = fullResponse.length / 4.0
+        onStatsReady(GenerationStats(
+            promptTokens = promptTokens,
+            completionTokens = completionTokens,
+            totalTokens = promptTokens + completionTokens,
+            timeToFirstToken = ttftMs,
+            totalTime = elapsedMs,
+            tokensPerSecond = if (elapsedMs > 0) completionTokens / (elapsedMs / 1000.0) else 0.0
+        ))
+        Log.d("StreamingCallbackListener", "Streaming done. Length: ${fullResponse.length}, TTFT: ${ttftMs.toLong()}ms, Total: ${elapsedMs.toLong()}ms")
     }
-    override fun onError(t: Throwable) {
-        Log.e("StreamingCallbackListener", "Async generation failed", t)
-        onToken("Error: ${t.message}", true)
+    override fun onError(throwable: Throwable) {
+        Log.e("StreamingCallbackListener", "Async generation failed", throwable)
+        onToken("Error: ${throwable.message}", true)
     }
 }
@@ -80,6 +114,10 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
     companion object {
         private const val TAG = "HybridLiteRTLM"
         private val initLock = Any()
+        /** Cached result of OpenCL availability probe (null = not yet checked). */
+        @Volatile
+        private var openCLAvailable: Boolean? = null
         /**
          * Initialize the native library.
@@ -161,6 +199,35 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
                 }
                 try {
+                    // Early GPU hardware check: probe for OpenCL library before
+                    // spending time on engine creation. LiteRT-LM's GPU delegate
+                    // requires OpenCL, which is absent on most Samsung/Qualcomm devices.
+                    if (backend == Backend.GPU) {
+                        val hasOpenCL = openCLAvailable ?: run {
+                            val result = try {
+                                System.loadLibrary("OpenCL")
+                                true
+                            } catch (_: UnsatisfiedLinkError) {
+                                try {
+                                    // Some devices have it at a non-standard path
+                                    System.load("/system/vendor/lib64/libOpenCL.so")
+                                    true
+                                } catch (_: UnsatisfiedLinkError) {
+                                    false
+                                }
+                            }
+                            openCLAvailable = result
+                            result
+                        }
+                        if (!hasOpenCL) {
+                            throw RuntimeException(
+                                "GPU backend is not supported on this device (OpenCL library not found). " +
+                                "Please use CPU backend instead."
+                            )
+                        }
+                        Log.i(TAG, "OpenCL library found — GPU backend is available")
+                    }
                     // Map our Backend enum to LiteRT-LM Backend sealed class
                     val lmBackend = when (backend) {
                         Backend.GPU -> com.google.ai.edge.litertlm.Backend.GPU()
@@ -215,9 +282,15 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
                     // Create Conversation
                     createNewConversation()
                     Log.i(TAG, "Conversation created successfully")
+                    // Validate the engine actually works with a quick test inference.
+                    // GPU backend can initialize without error but silently fail to produce tokens.
+                    validateEngine()
                 } catch (e: Exception) {
                     Log.e(TAG, "Failed to load model: ${e.message}", e)
+                    // Clean up partial state so isReady() returns false
+                    cleanupInternal()
                     throw RuntimeException("Failed to load model: ${e.message}", e)
                 }
             }
@@ -241,7 +314,7 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
             Log.i(TAG, "sendMessage (Promise): $message")
             // Blocking inference (safe here because we are in Promise.parallel worker thread)
-            val userMsg = LiteRTMessage.of(text = message)
+            val userMsg = LiteRTMessage.user(message)
             val startTime = System.nanoTime()
             val responseMsg = conversation!!.sendMessage(message = userMsg)
             val elapsedMs = (System.nanoTime() - startTime) / 1_000_000.0
@@ -292,10 +365,12 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
             onToken = onToken,
             responseBuilder = fullResponseBuilder,
             history = history,
+            userMessage = message,
+            onStatsReady = { stats -> lastStats = stats },
         )
         try {
-            val userMsg = LiteRTMessage.of(text = message)
+            val userMsg = LiteRTMessage.user(message)
             conversation!!.sendMessageAsync(message = userMsg, callback = listener)
         } catch (e: Exception) {
             Log.e(TAG, "Failed to initiate async generation", e)
@@ -359,7 +434,7 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
             // Build the user message from the text prompt and the image file content
             val textContent = Content.Text(message)
-            val userMsg = LiteRTMessage.of(textContent, Content.ImageFile(processedImagePath))
+            val userMsg = LiteRTMessage.user(Contents.of(textContent, Content.ImageFile(processedImagePath)))
             // Add to history
             history.add(Message(Role.USER, "$message [Image]"))
@@ -501,10 +576,10 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
             // Load audio
-            val userMsg = LiteRTMessage.of(
+            val userMsg = LiteRTMessage.user(Contents.of(
                 Content.Text(message),
                 Content.AudioFile(audioPath)
-            )
+            ))
             history.add(Message(Role.USER, "$message [Audio]"))
@@ -641,7 +716,16 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
             }
             conversation = null
         }
-        conversation = engine!!.createConversation()
+        // Create the conversation with an explicit SamplerConfig (matching the Gallery app pattern).
+        // GPU backend may fail silently without proper sampler params.
+        val convConfig = ConversationConfig(
+            samplerConfig = SamplerConfig(
+                topK = topK,
+                topP = topP,
+                temperature = temperature,
+            )
+        )
+        conversation = engine!!.createConversation(convConfig)
         // Apply system prompt/instruction if set
         systemPrompt?.let { prompt ->
             if (prompt.isNotEmpty()) {
@@ -649,7 +733,7 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
                     // Send system instruction as the first turn to prime the conversation.
                     // LiteRT-LM's Conversation API handles chat template formatting,
                     // including Gemma's <start_of_turn>system block.
-                    val systemMsg = LiteRTMessage.of(Content.Text(prompt))
+                    val systemMsg = LiteRTMessage.system(prompt)
                     conversation!!.sendMessage(message = systemMsg)
                     Log.i(TAG, "System prompt applied (${prompt.length} chars)")
                 } catch (e: Exception) {
@@ -659,5 +743,77 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
         }
     }
+    /**
+     * Validate that the engine can actually produce inference output.
+     *
+     * Some GPU backends initialize without error but silently hang during inference.
+     * This sends a minimal test prompt ("Hi") and waits up to 30s for any token.
+     * If no token arrives, we throw so the model does NOT appear as loaded.
+     */
+    private fun validateEngine() {
+        val backendName = when (backend) {
+            Backend.GPU -> "GPU"
+            Backend.NPU -> "NPU"
+            else -> "CPU"
+        }
+        Log.i(TAG, "Validating $backendName backend with test inference...")
+        val latch = CountDownLatch(1)
+        val gotToken = AtomicBoolean(false)
+        val errorRef = AtomicReference<String?>(null)
+        // Use the existing conversation for validation (single-session constraint).
+        val validationConv = conversation
+            ?: throw RuntimeException("$backendName backend: no conversation available for validation")
+        try {
+            val testMsg = LiteRTMessage.user("Hi")
+            validationConv.sendMessageAsync(
+                message = testMsg,
+                callback = object : com.google.ai.edge.litertlm.MessageCallback {
+                    override fun onMessage(msg: com.google.ai.edge.litertlm.Message) {
+                        gotToken.set(true)
+                        latch.countDown()
+                    }
+                    override fun onDone() {
+                        latch.countDown()
+                    }
+                    override fun onError(t: Throwable) {
+                        errorRef.set(t.message)
+                        latch.countDown()
+                    }
+                }
+            )
+        } catch (e: Exception) {
+            throw RuntimeException(
+                "$backendName backend failed to run inference: ${e.message}. " +
+                "This device may not support the $backendName backend. Please try CPU.",
+                e
+            )
+        }
+        // Wait up to 30s for any response
+        val completed = latch.await(30, TimeUnit.SECONDS)
+        val error = errorRef.get()
+        if (error != null) {
+            throw RuntimeException(
+                "$backendName backend inference error: $error. " +
+                "This device may not support the $backendName backend. Please try CPU."
+            )
+        }
+        if (!completed || !gotToken.get()) {
+            throw RuntimeException(
+                "$backendName backend produced no response within 30 seconds. " +
+                "This device may not support the $backendName backend. Please try CPU."
+            )
+        }
+        Log.i(TAG, "$backendName backend validated successfully")
+        // Re-create the real conversation (validation consumed one turn)
+        createNewConversation()
+    }
 }

package/cpp/HybridLiteRTLM.cpp CHANGED Viewed

@@ -110,31 +110,67 @@ std::string HybridLiteRTLM::buildAudioMessageJson(const std::string& text, const
 }
 /**
- * Strip Gemma / LiteRT-LM control tokens from model output.
- * The iOS C API returns raw model text including stop/turn markers
- * that the Android Kotlin SDK strips automatically.
+ * Gemma / LiteRT-LM control tokens that the iOS C API includes in raw output.
+ * The Android Kotlin SDK strips these automatically.
+ */
+static const char* kControlTokens[] = {
+  "<end_of_turn>",
+  "<start_of_turn>model",
+  "<start_of_turn>user",
+  "<start_of_turn>",
+  "<eos>",
+};
+/**
+ * Strip control tokens from model output, preserving whitespace.
+ * Streaming tokens like " the", " is" have meaningful leading spaces
+ * that must not be trimmed.
  */
 static std::string stripControlTokens(const std::string& text) {
-  static const char* tokens[] = {
-    "<end_of_turn>",
-    "<start_of_turn>model",
-    "<start_of_turn>user",
-    "<start_of_turn>",
-    "<eos>",
-  };
   std::string result = text;
-  for (auto* tok : tokens) {
+  for (auto* tok : kControlTokens) {
     std::string t(tok);
     size_t pos;
     while ((pos = result.find(t)) != std::string::npos) {
       result.erase(pos, t.length());
     }
   }
-  // Trim leading/trailing whitespace
-  size_t start = result.find_first_not_of(" \t\n\r");
+  return result;
+}
+/**
+ * Determine how many characters from the start of `text` are safe to emit.
+ * If the tail of `text` could be the beginning of a control token (split
+ * across chunk boundaries), those characters are withheld until the next
+ * chunk confirms whether it's a real token or normal content.
+ */
+static size_t safeEmitLength(const std::string& text) {
+  // Find the last '<' — it could be the start of a partial control token
+  size_t lastAngle = text.rfind('<');
+  if (lastAngle == std::string::npos) {
+    return text.length();  // No '<' found, safe to emit all
+  }
+  std::string suffix = text.substr(lastAngle);
+  // Check if this suffix is a prefix of any control token
+  for (auto* tok : kControlTokens) {
+    std::string t(tok);
+    if (suffix.length() < t.length() && t.compare(0, suffix.length(), suffix) == 0) {
+      // This suffix could be the start of a control token — hold it back
+      return lastAngle;
+    }
+  }
+  // The '<' doesn't match any control token prefix, safe to emit all
+  return text.length();
+}
+/** Trim leading/trailing whitespace from a complete response. */
+static std::string trimWhitespace(const std::string& text) {
+  size_t start = text.find_first_not_of(" \t\n\r");
   if (start == std::string::npos) return "";
-  size_t end = result.find_last_not_of(" \t\n\r");
-  return result.substr(start, end - start + 1);
+  size_t end = text.find_last_not_of(" \t\n\r");
+  return text.substr(start, end - start + 1);
 }
 std::string HybridLiteRTLM::extractTextFromResponse(const std::string& jsonResponse) {
@@ -427,7 +463,7 @@ std::string HybridLiteRTLM::sendMessageInternal(const std::string& message) {
   const char* responseStr = litert_lm_json_response_get_string(response);
   if (responseStr) {
-    result = extractTextFromResponse(std::string(responseStr));
+    result = trimWhitespace(extractTextFromResponse(std::string(responseStr)));
   }
   litert_lm_json_response_delete(response);
@@ -485,6 +521,26 @@ void HybridLiteRTLM::streamCallbackFn(void* callback_data, const char* chunk,
       ctx->lastStats->tokensPerSecond = (ctx->tokenCount / durationMs) * 1000.0;
     }
+    // Final flush: do one last clean of the full accumulated response
+    // to emit any text that was withheld by safeEmitLength.
+    std::string cleaned = stripControlTokens(ctx->rawResponse);
+    size_t start = cleaned.find_first_not_of(" \t\n\r");
+    if (start != std::string::npos) {
+      cleaned = cleaned.substr(start);
+      // Strip echoed user message
+      if (!ctx->userMessage.empty() && cleaned.find(ctx->userMessage) == 0) {
+        cleaned = cleaned.substr(ctx->userMessage.length());
+        size_t nextStart = cleaned.find_first_not_of(" \t\n\r");
+        cleaned = (nextStart != std::string::npos) ? cleaned.substr(nextStart) : "";
+      }
+      // Emit any remaining text not yet sent
+      if (cleaned.length() > ctx->lastEmittedLength) {
+        std::string remaining = cleaned.substr(ctx->lastEmittedLength);
+        ctx->onToken(remaining, false);
+      }
+      ctx->fullResponse = cleaned;
+    }
     // Update history (thread-safe)
     {
       std::lock_guard<std::mutex> lock(*ctx->historyMutex);
@@ -499,12 +555,55 @@ void HybridLiteRTLM::streamCallbackFn(void* callback_data, const char* chunk,
   if (chunk) {
     std::string token(chunk);
-    // Filter out Gemma control tokens from streamed chunks
-    std::string cleaned = stripControlTokens(token);
-    ctx->fullResponse += cleaned;
-    ctx->tokenCount++;
-    if (!cleaned.empty()) {
-      ctx->onToken(cleaned, false);
+    // The C API may return JSON-wrapped responses (e.g.
+    // {"role":"model","content":[{"type":"text","text":"Hi"}]})
+    // instead of raw text tokens. Detect and extract text content.
+    std::string raw;
+    if (token.size() > 2 && token[0] == '{' && token.find("\"role\"") != std::string::npos) {
+      raw = HybridLiteRTLM::extractTextFromResponse(token);
+    } else {
+      raw = token;
+    }
+    // Accumulate raw text, then strip control tokens from the FULL buffer.
+    // This correctly handles tokens split across chunk boundaries (e.g.
+    // chunk1="<end_of_tu" chunk2="rn>Hello").
+    ctx->rawResponse += raw;
+    std::string cleaned = stripControlTokens(ctx->rawResponse);
+    // Trim leading whitespace from the overall response
+    size_t start = cleaned.find_first_not_of(" \t\n\r");
+    if (start == std::string::npos) {
+      // Still only whitespace/control tokens — nothing to emit yet
+      return;
+    }
+    cleaned = cleaned.substr(start);
+    // The C API may echo back the user's message before the model response.
+    // Strip the echoed user message prefix if present.
+    if (!ctx->userMessage.empty()) {
+      size_t userPos = cleaned.find(ctx->userMessage);
+      if (userPos == 0) {
+        cleaned = cleaned.substr(ctx->userMessage.length());
+        // Trim any whitespace after the stripped user message
+        size_t nextStart = cleaned.find_first_not_of(" \t\n\r");
+        if (nextStart == std::string::npos) {
+          return;  // Only user message so far, nothing to emit
+        }
+        cleaned = cleaned.substr(nextStart);
+      }
+    }
+    // Only emit text that is "safe" — withhold any trailing characters
+    // that could be the start of a control token split across chunks.
+    size_t safe = safeEmitLength(cleaned);
+    if (safe > ctx->lastEmittedLength) {
+      std::string newText = cleaned.substr(ctx->lastEmittedLength, safe - ctx->lastEmittedLength);
+      ctx->fullResponse = cleaned.substr(0, safe);
+      ctx->lastEmittedLength = safe;
+      ctx->tokenCount++;
+      ctx->onToken(newText, false);
     }
   }
 }
@@ -520,7 +619,9 @@ void HybridLiteRTLM::sendMessageAsync(
   // Capture shared state safely — use unique_ptr to prevent leaks
   auto ctxOwner = std::make_unique<StreamContext>();
   ctxOwner->onToken = std::move(onTokenCopy);
+  ctxOwner->rawResponse = "";
   ctxOwner->fullResponse = "";
+  ctxOwner->lastEmittedLength = 0;
   ctxOwner->history = &history_;
   ctxOwner->historyMutex = &mutex_;
   ctxOwner->userMessage = messageCopy;
@@ -602,7 +703,7 @@ std::string HybridLiteRTLM::sendMessageWithImageInternal(
   const char* responseStr = litert_lm_json_response_get_string(response);
   if (responseStr) {
-    result = extractTextFromResponse(std::string(responseStr));
+    result = trimWhitespace(extractTextFromResponse(std::string(responseStr)));
   }
   litert_lm_json_response_delete(response);
 #else
@@ -662,7 +763,7 @@ std::string HybridLiteRTLM::sendMessageWithAudioInternal(
   const char* responseStr = litert_lm_json_response_get_string(response);
   if (responseStr) {
-    result = extractTextFromResponse(std::string(responseStr));
+    result = trimWhitespace(extractTextFromResponse(std::string(responseStr)));
   }
   litert_lm_json_response_delete(response);
 #else

package/cpp/HybridLiteRTLM.hpp CHANGED Viewed

@@ -149,7 +149,9 @@ private:
   // Streaming callback context (must be a plain struct for C function pointer)
   struct StreamContext {
     std::function<void(const std::string&, bool)> onToken;
-    std::string fullResponse;
+    std::string rawResponse;     // Raw accumulated chunks (before stripping)
+    std::string fullResponse;    // Clean accumulated text (after stripping)
+    size_t lastEmittedLength;    // Length of fullResponse already emitted to JS
     std::vector<Message>* history;
     std::mutex* historyMutex;
     std::string userMessage;

package/cpp/include/README.md CHANGED Viewed

@@ -1,34 +1,32 @@
 # LiteRT-LM Headers Fallback
-This directory is a fallback location for LiteRT-LM C++ headers when Prefab doesn't expose them correctly.
+This directory contains the LiteRT-LM C API header (`litert_lm_engine.h`) used by the iOS C++ implementation.
 ## If Headers Are Missing
-If you get compilation errors like `litert/lm/engine.h: No such file or directory`, you need to manually copy LiteRT-LM headers here:
+If you get compilation errors like `litert_lm_engine.h: No such file or directory`, you need to manually copy the LiteRT-LM C API header here:
 1. Clone LiteRT-LM repository:
    ```bash
    git clone https://github.com/google-ai-edge/LiteRT-LM.git /tmp/LiteRT-LM
+   cd /tmp/LiteRT-LM && git checkout v0.10.2
    ```
-2. Copy the headers:
+2. Copy the header:
    ```bash
-   cp -r /tmp/LiteRT-LM/runtime/include/litert ./
+   cp /tmp/LiteRT-LM/c/litert_lm_engine.h ./
    ```
 The expected directory structure after copying:
 ```
 cpp/include/
-└── litert/
-    └── lm/
-        ├── engine.h
-        ├── conversation.h
-        ├── types.h
-        └── ...
+├── litert_lm_engine.h   # LiteRT-LM C API header
+├── stb_image.h          # Image loading for multimodal
+└── README.md
 ```
 ## Note
-The ideal scenario is that the Maven package `litertlm-android:0.9.0-alpha01` exposes headers via Prefab, making this directory unnecessary. This is just a fallback.
+On **Android**, headers are provided by the `litertlm-android` AAR via Prefab — this directory is only needed for the **iOS** build which uses the raw C API via the prebuilt XCFramework.

package/lib/index.d.ts CHANGED Viewed

@@ -110,8 +110,8 @@ export declare function checkBackendSupport(backend: Backend): string | undefine
  */
 export declare function checkMultimodalSupport(): string | undefined;
 /**
- * Download URL for the Gemma 3n E2B IT INT4 model.
- * Note: Requires a HuggingFace account (gated model).
+ * Download URL for the Gemma 3n E2B IT INT4 model (~1.3 GB).
+ * Public — hosted on litert.dev, no authentication required.
  */
 export declare const GEMMA_3N_E2B_IT_INT4 = "https://litert.dev/gemma-3n-E2B-it-int4.litertlm";
 /**

package/lib/index.js CHANGED Viewed

@@ -116,6 +116,15 @@ function getRecommendedBackend() {
  * ```
  */
 function checkBackendSupport(backend) {
+    if (backend === "gpu") {
+        if (react_native_1.Platform.OS === "android") {
+            // LiteRT-LM GPU delegate requires OpenCL, which is unavailable
+            // on most Samsung/Qualcomm devices. Only Pixel devices reliably expose it.
+            return "GPU backend requires OpenCL support, which is unavailable on most Samsung and Qualcomm devices.";
+        }
+        // iOS always supports GPU via Metal
+        return undefined;
+    }
     if (backend === "npu") {
         if (react_native_1.Platform.OS === "android") {
             return "NPU backend requires compatible hardware (Qualcomm Hexagon, MediaTek APU, etc.). Will fall back to GPU if unavailable.";
@@ -150,8 +159,8 @@ function checkMultimodalSupport() {
     return undefined;
 }
 /**
- * Download URL for the Gemma 3n E2B IT INT4 model.
- * Note: Requires a HuggingFace account (gated model).
+ * Download URL for the Gemma 3n E2B IT INT4 model (~1.3 GB).
+ * Public — hosted on litert.dev, no authentication required.
  */
 exports.GEMMA_3N_E2B_IT_INT4 = "https://litert.dev/gemma-3n-E2B-it-int4.litertlm";
 /**

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "react-native-litert-lm",
-  "version": "0.3.6",
+  "version": "0.3.7",
   "litertLm": {
     "version": "0.10.2",
     "androidMavenVersion": "0.10.2",

package/src/index.ts CHANGED Viewed

@@ -132,6 +132,16 @@ export function getRecommendedBackend(): Backend {
  * ```
  */
 export function checkBackendSupport(backend: Backend): string | undefined {
+  if (backend === "gpu") {
+    if (Platform.OS === "android") {
+      // LiteRT-LM GPU delegate requires OpenCL, which is unavailable
+      // on most Samsung/Qualcomm devices. Only Pixel devices reliably expose it.
+      return "GPU backend requires OpenCL support, which is unavailable on most Samsung and Qualcomm devices.";
+    }
+    // iOS always supports GPU via Metal
+    return undefined;
+  }
   if (backend === "npu") {
     if (Platform.OS === "android") {
       return "NPU backend requires compatible hardware (Qualcomm Hexagon, MediaTek APU, etc.). Will fall back to GPU if unavailable.";
@@ -169,8 +179,8 @@ export function checkMultimodalSupport(): string | undefined {
 }
 /**
- * Download URL for the Gemma 3n E2B IT INT4 model.
- * Note: Requires a HuggingFace account (gated model).
+ * Download URL for the Gemma 3n E2B IT INT4 model (~1.3 GB).
+ * Public — hosted on litert.dev, no authentication required.
  */
 export const GEMMA_3N_E2B_IT_INT4 =
   "https://litert.dev/gemma-3n-E2B-it-int4.litertlm";