react-native-litert-lm 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +43 -25
- package/android/build.gradle +6 -2
- package/android/src/main/java/com/margelo/nitro/dev/litert/litertlm/HybridLiteRTLM.kt +26 -30
- package/app.plugin.js +28 -3
- package/cpp/HybridLiteRTLM.cpp +146 -63
- package/cpp/HybridLiteRTLM.hpp +2 -2
- package/lib/hooks.js +4 -0
- package/lib/index.d.ts +19 -2
- package/lib/index.js +24 -7
- package/lib/specs/LiteRTLM.nitro.d.ts +7 -7
- package/package.json +19 -13
- package/scripts/build-ios-engine.sh +1 -1
- package/scripts/download-ios-frameworks.sh +1 -1
- package/src/hooks.ts +5 -0
- package/src/index.ts +27 -6
- package/src/specs/LiteRTLM.nitro.ts +7 -7
package/README.md
CHANGED
|
@@ -1,16 +1,16 @@
|
|
|
1
1
|
# react-native-litert-lm
|
|
2
2
|
|
|
3
|
-
High-performance on-device LLM inference for React Native, powered by [LiteRT-LM](https://github.com/google-ai-edge/LiteRT-LM) and [Nitro Modules](https://github.com/mrousavy/nitro). Optimized for **Gemma
|
|
3
|
+
High-performance on-device LLM inference for React Native, powered by [LiteRT-LM](https://github.com/google-ai-edge/LiteRT-LM) and [Nitro Modules](https://github.com/mrousavy/nitro). Optimized for **Gemma 4** and other on-device language models.
|
|
4
4
|
|
|
5
5
|
## Features
|
|
6
6
|
|
|
7
7
|
- ๐ **Native Performance** โ Kotlin (Android) / C++ (iOS) via Nitro Modules JSI bindings
|
|
8
|
-
- ๐ง **Gemma
|
|
8
|
+
- 🧠 **Gemma 4 Ready** — First-class support for Gemma 4 E2B/E4B multimodal models (text + vision + audio)
|
|
9
9
|
- ⚡ **GPU Acceleration** — GPU delegate (Android), Metal/MPS (iOS)
|
|
10
10
|
- ๐ **Streaming Support** โ Token-by-token generation callbacks
|
|
11
11
|
- 📱 **Cross-Platform** — Android API 26+ / iOS 15.0+
|
|
12
|
-
- ๐ผ๏ธ **Multimodal** โ Image and audio input support
|
|
13
|
-
- ๐งต **Async API** โ Non-blocking inference on
|
|
12
|
+
- 🖼️ **Multimodal** — Image and audio input support
|
|
13
|
+
- 🧵 **Async API** — Non-blocking inference on dedicated large-stack threads
|
|
14
14
|
- ๐ **Real Memory Tracking** โ OS-level memory metrics (RSS, native heap, available memory) via native APIs
|
|
15
15
|
- 🧮 **Zero-Copy Buffers** — Memory snapshots stored in native ArrayBuffers via Nitro Modules
|
|
16
16
|
- 📥 **Automatic Model Download** — Downloads models from URL with progress tracking and local caching
|
|
@@ -94,7 +94,7 @@ The `example/` directory contains a fully functional test app with a dark-themed
|
|
|
94
94
|
|
|
95
95
|
## Model Management
|
|
96
96
|
|
|
97
|
-
LiteRT-LM models (like Gemma
|
|
97
|
+
LiteRT-LM models (like Gemma 4) are large files (2–4 GB) and cannot be bundled into your app binary. They are downloaded at runtime.
|
|
98
98
|
|
|
99
99
|
### Automatic Downloading
|
|
100
100
|
|
|
@@ -112,16 +112,15 @@ If you prefer to manage downloads yourself (e.g., using `expo-file-system`), dow
|
|
|
112
112
|
|
|
113
113
|
```typescript
|
|
114
114
|
import * as FileSystem from "expo-file-system";
|
|
115
|
+
import { GEMMA_4_E2B_IT } from "react-native-litert-lm";
|
|
115
116
|
|
|
116
|
-
const
|
|
117
|
-
"https://huggingface.co/litert-community/gemma-3n-2b-it/resolve/main/model.litertlm";
|
|
118
|
-
const localPath = `${FileSystem.documentDirectory}gemma-3n.litertlm`;
|
|
117
|
+
const localPath = `${FileSystem.documentDirectory}gemma-4-E2B-it.litertlm`;
|
|
119
118
|
|
|
120
119
|
async function downloadModel() {
|
|
121
120
|
const info = await FileSystem.getInfoAsync(localPath);
|
|
122
121
|
if (info.exists) return localPath;
|
|
123
122
|
|
|
124
|
-
await FileSystem.downloadAsync(
|
|
123
|
+
await FileSystem.downloadAsync(GEMMA_4_E2B_IT, localPath);
|
|
125
124
|
return localPath;
|
|
126
125
|
}
|
|
127
126
|
```
|
|
@@ -133,7 +132,7 @@ async function downloadModel() {
|
|
|
133
132
|
The `useModel` hook manages the full model lifecycle: downloading, loading, inference, and cleanup.
|
|
134
133
|
|
|
135
134
|
```typescript
|
|
136
|
-
import { useModel,
|
|
135
|
+
import { useModel, GEMMA_4_E2B_IT } from "react-native-litert-lm";
|
|
137
136
|
import { Platform } from "react-native";
|
|
138
137
|
|
|
139
138
|
function App() {
|
|
@@ -145,8 +144,8 @@ function App() {
|
|
|
145
144
|
load, // Manually trigger load
|
|
146
145
|
deleteModel, // Delete cached model file
|
|
147
146
|
memorySummary, // Auto-updated memory stats (if tracking enabled)
|
|
148
|
-
} = useModel(
|
|
149
|
-
backend:
|
|
147
|
+
} = useModel(GEMMA_4_E2B_IT, {
|
|
148
|
+
backend: 'cpu',
|
|
150
149
|
autoLoad: true, // Default: true. Set false to load manually via load().
|
|
151
150
|
systemPrompt: "You are a helpful assistant.",
|
|
152
151
|
enableMemoryTracking: true,
|
|
@@ -206,7 +205,7 @@ const warning = checkMultimodalSupport();
|
|
|
206
205
|
if (warning) {
|
|
207
206
|
console.warn(warning); // Experimental on iOS
|
|
208
207
|
} else {
|
|
209
|
-
// Image input (for vision models like Gemma
|
|
208
|
+
// Image input (for vision models like Gemma 4)
|
|
210
209
|
// Images >1024px are automatically resized to prevent OOM
|
|
211
210
|
const response = await llm.sendMessageWithImage(
|
|
212
211
|
"What's in this image?",
|
|
@@ -310,15 +309,20 @@ const buffer = tracker.getNativeBuffer();
|
|
|
310
309
|
|
|
311
310
|
Download `.litertlm` models automatically using the exported URL constants, or manually from [HuggingFace](https://huggingface.co/litert-community):
|
|
312
311
|
|
|
313
|
-
| Constant | Model
|
|
314
|
-
| :--------------------- |
|
|
315
|
-
| `
|
|
312
|
+
| Constant | Model | Size | Min RAM | Auth Required |
|
|
313
|
+
| :--------------------- | :--------------------------------- | :------- | :------ | :------------ |
|
|
314
|
+
| `GEMMA_4_E2B_IT` | Gemma 4 E2B (Multimodal, IT) | 2.58 GB | 4 GB+ | ❌ No |
|
|
315
|
+
| `GEMMA_4_E4B_IT` | Gemma 4 E4B (Higher Quality) | 3.65 GB | 6 GB+ | ❌ No |
|
|
316
|
+
| `GEMMA_3N_E2B_IT_INT4` | Gemma 3n E2B (Int4, Multimodal) | ~1.3 GB | 4 GB+ | ✅ HuggingFace |
|
|
317
|
+
|
|
318
|
+
> **Recommended:** Use `GEMMA_4_E2B_IT` for most use cases. It's multimodal (text + vision + audio) and downloads directly from HuggingFace without requiring an account.
|
|
319
|
+
>
|
|
320
|
+
> **iOS Note:** Models larger than ~2 GB (like Gemma 4) require the `com.apple.developer.kernel.extended-virtual-addressing` entitlement. See [iOS Entitlements](#ios-entitlements) below.
|
|
316
321
|
|
|
317
322
|
**Other compatible models** (download manually from HuggingFace):
|
|
318
323
|
|
|
319
324
|
| Model | Size | Min RAM | Notes |
|
|
320
325
|
| ------------- | ------- | ------- | --------------------- |
|
|
321
|
-
| Gemma 3n E4B | ~4 GB | 8 GB+ | Higher quality |
|
|
322
326
|
| Gemma 3 1B | ~1 GB | 4 GB+ | Smallest, fastest |
|
|
323
327
|
| Phi-4 Mini | ~2 GB | 4 GB+ | Microsoft's small LLM |
|
|
324
328
|
| Qwen 2.5 1.5B | ~1.5 GB | 4 GB+ | Multilingual |
|
|
@@ -339,7 +343,7 @@ Loads a model from a local path or HTTPS URL.
|
|
|
339
343
|
| Parameter | Type | Default | Description |
|
|
340
344
|
| --------------------- | -------- | ------- | ----------------------------------------- |
|
|
341
345
|
| `path` | `string` | — | Absolute path to `.litertlm` or HTTPS URL |
|
|
342
|
-
| `config.backend` | `string` | `'
|
|
346
|
+
| `config.backend` | `string` | `'cpu'` | `'cpu'`, `'gpu'`, or `'npu'` |
|
|
343
347
|
| `config.systemPrompt` | `string` | — | System prompt for the model |
|
|
344
348
|
| `config.temperature` | `number` | `0.7` | Sampling temperature |
|
|
345
349
|
| `config.topK` | `number` | `40` | Top-K sampling |
|
|
@@ -354,7 +358,7 @@ Loads a model from a local path or HTTPS URL.
|
|
|
354
358
|
| `'gpu'` | GPU / Metal | Fast | Recommended default |
|
|
355
359
|
| `'npu'` | NPU / Neural Engine | Fastest | Requires supported hardware; falls back to GPU |
|
|
356
360
|
|
|
357
|
-
> **iOS**: `'gpu'`
|
|
361
|
+
> **iOS**: `'cpu'` is the recommended default backend. `'gpu'` (Metal/MPS) is also supported. The engine automatically tries multiple backend combinations if the primary one fails.
|
|
358
362
|
|
|
359
363
|
### `sendMessage(message): Promise<string>`
|
|
360
364
|
|
|
@@ -366,11 +370,11 @@ Streaming generation. Callback signature: `(token: string, isDone: boolean) => v
|
|
|
366
370
|
|
|
367
371
|
### `sendMessageWithImage(message, imagePath): Promise<string>`
|
|
368
372
|
|
|
369
|
-
Send a message with an image (
|
|
373
|
+
Send a message with an image (for vision models like Gemma 4 E2B).
|
|
370
374
|
|
|
371
375
|
### `sendMessageWithAudio(message, audioPath): Promise<string>`
|
|
372
376
|
|
|
373
|
-
Send a message with audio (
|
|
377
|
+
Send a message with audio (for audio-capable models like Gemma 4 E2B).
|
|
374
378
|
|
|
375
379
|
### `getStats(): GenerationStats`
|
|
376
380
|
|
|
@@ -448,7 +452,7 @@ const prompt = applyGemmaTemplate(
|
|
|
448
452
|
| react-native-nitro-modules | 0.35.0+ |
|
|
449
453
|
| Android API | 26+ (ARM64) |
|
|
450
454
|
| iOS | 15.0+ (ARM64) |
|
|
451
|
-
| LiteRT-LM Engine | 0.
|
|
455
|
+
| LiteRT-LM Engine | 0.10.1 |
|
|
452
456
|
|
|
453
457
|
## Platform Support
|
|
454
458
|
|
|
@@ -463,7 +467,8 @@ const prompt = applyGemmaTemplate(
|
|
|
463
467
|
| ---------------------------- | ------ | ----------------------------------------------------- |
|
|
464
468
|
| Text inference (blocking) | ✅ | Via LiteRT-LM C API |
|
|
465
469
|
| Text inference (streaming) | ✅ | Token-by-token callbacks |
|
|
466
|
-
|
|
|
470
|
+
| CPU inference | ✅ | Recommended default backend |
|
|
471
|
+
| GPU inference (Metal/MPS) | ✅ | Supported via `backend: 'gpu'` |
|
|
467
472
|
| Model download with progress | ✅ | NSURLSession, cached in `Caches/` |
|
|
468
473
|
| Memory tracking | ✅ | `mach_task_basic_info` |
|
|
469
474
|
| Multi-turn conversation | ✅ | Context retained across turns |
|
|
@@ -471,6 +476,19 @@ const prompt = applyGemmaTemplate(
|
|
|
471
476
|
| Constrained decoding | ❌ | Requires llguidance Rust runtime |
|
|
472
477
|
| Function calling | ❌ | Requires Rust CXX bridge runtime |
|
|
473
478
|
|
|
479
|
+
### iOS Entitlements
|
|
480
|
+
|
|
481
|
+
Models larger than ~2 GB (like Gemma 4 E2B at 2.58 GB) require the **Extended Virtual Addressing** entitlement on iOS physical devices. Without it, iOS limits virtual memory to ~2 GB and the app will be killed by Jetsam.
|
|
482
|
+
|
|
483
|
+
Add to your app's `.entitlements` file:
|
|
484
|
+
|
|
485
|
+
```xml
|
|
486
|
+
<key>com.apple.developer.kernel.extended-virtual-addressing</key>
|
|
487
|
+
<true/>
|
|
488
|
+
```
|
|
489
|
+
|
|
490
|
+
> **Note:** This entitlement requires a **paid Apple Developer account** ($99/year). Gemma 3n E2B (~1.3 GB) works without it.
|
|
491
|
+
|
|
474
492
|
## Building the iOS Engine
|
|
475
493
|
|
|
476
494
|
The iOS build uses a **Bazel-to-XCFramework pipeline** that compiles the LiteRT-LM C engine and all transitive dependencies into a static library (~83 MB).
|
|
@@ -488,7 +506,7 @@ The iOS build uses a **Bazel-to-XCFramework pipeline** that compiles the LiteRT-
|
|
|
488
506
|
|
|
489
507
|
This will:
|
|
490
508
|
|
|
491
|
-
1. Clone/checkout LiteRT-LM `v0.
|
|
509
|
+
1. Clone/checkout LiteRT-LM `v0.10.1` source into `.litert-lm-build/`
|
|
492
510
|
2. Build `//c:engine` for `ios_arm64` and `ios_sim_arm64` via Bazel
|
|
493
511
|
3. Collect all transitive `.o` files (engine, protobuf, re2, sentencepiece, etc.)
|
|
494
512
|
4. Compile C/C++ stubs for unavailable Rust dependencies
|
|
@@ -540,7 +558,7 @@ Additionally, `PromptTemplate` is patched at build time to use a simplified C++
|
|
|
540
558
|
```
|
|
541
559
|
|
|
542
560
|
- **Android**: Kotlin (`HybridLiteRTLM.kt`) interfacing with the `litertlm-android` AAR.
|
|
543
|
-
- **iOS**: C++ (`HybridLiteRTLM.cpp`) interfacing with the LiteRT-LM C API via a prebuilt `LiteRTLM.xcframework`. Platform-specific code (model downloading, file management) is in Objective-C++ (`ios/IOSDownloadHelper.mm`).
|
|
561
|
+
- **iOS**: C++ (`HybridLiteRTLM.cpp`) interfacing with the LiteRT-LM C API via a prebuilt `LiteRTLM.xcframework`. All engine operations (load, inference, streaming) run on dedicated `pthread` threads with 8 MB stack to accommodate XNNPack's stack requirements. Platform-specific code (model downloading, file management) is in Objective-C++ (`ios/IOSDownloadHelper.mm`).
|
|
544
562
|
|
|
545
563
|
> **For contributors**: Changes to `cpp/HybridLiteRTLM.cpp` do not affect Android. Feature changes must be applied to both the Kotlin and C++ implementations.
|
|
546
564
|
|
package/android/build.gradle
CHANGED
|
@@ -9,9 +9,13 @@ plugins {
|
|
|
9
9
|
// Apply Nitrogen autolinking
|
|
10
10
|
apply from: '../nitrogen/generated/android/LiteRTLM+autolinking.gradle'
|
|
11
11
|
|
|
12
|
+
// Read LiteRT-LM SDK version from package.json (single source of truth)
|
|
13
|
+
def packageJson = new groovy.json.JsonSlurper().parseText(file('../package.json').text)
|
|
14
|
+
def litertLmVersion = packageJson.litertLm.androidMavenVersion
|
|
15
|
+
|
|
12
16
|
android {
|
|
13
17
|
namespace "dev.litert.litertlm"
|
|
14
|
-
compileSdk
|
|
18
|
+
compileSdk 36
|
|
15
19
|
|
|
16
20
|
defaultConfig {
|
|
17
21
|
minSdk 26 // LiteRT-LM requires API 26+
|
|
@@ -84,5 +88,5 @@ dependencies {
|
|
|
84
88
|
implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-android:1.7.3'
|
|
85
89
|
|
|
86
90
|
// LiteRT-LM Kotlin API
|
|
87
|
-
implementation
|
|
91
|
+
implementation "com.google.ai.edge.litertlm:litertlm-android:${litertLmVersion}"
|
|
88
92
|
}
|
|
@@ -44,8 +44,8 @@ internal class StreamingCallbackListener(
|
|
|
44
44
|
private val history: MutableList<Message>,
|
|
45
45
|
) : com.google.ai.edge.litertlm.MessageCallback {
|
|
46
46
|
|
|
47
|
-
override fun onMessage(responseMsg: com.google.ai.edge.litertlm.
|
|
48
|
-
val chunk = responseMsg.contents
|
|
47
|
+
override fun onMessage(responseMsg: com.google.ai.edge.litertlm.Message) {
|
|
48
|
+
val chunk = responseMsg.contents.contents
|
|
49
49
|
.filterIsInstance<com.google.ai.edge.litertlm.Content.Text>()
|
|
50
50
|
.joinToString("") { it.text }
|
|
51
51
|
|
|
@@ -123,7 +123,7 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
|
|
|
123
123
|
)
|
|
124
124
|
|
|
125
125
|
// Configuration
|
|
126
|
-
private var backend: Backend = Backend.
|
|
126
|
+
private var backend: Backend = Backend.CPU
|
|
127
127
|
private var temperature: Double = 0.7
|
|
128
128
|
private var topK: Int = 40
|
|
129
129
|
private var topP: Double = 0.95
|
|
@@ -161,21 +161,21 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
|
|
|
161
161
|
}
|
|
162
162
|
|
|
163
163
|
try {
|
|
164
|
-
// Map our Backend enum to LiteRT-LM Backend
|
|
164
|
+
// Map our Backend enum to LiteRT-LM Backend sealed class
|
|
165
165
|
val lmBackend = when (backend) {
|
|
166
|
-
Backend.GPU -> com.google.ai.edge.litertlm.Backend.GPU
|
|
166
|
+
Backend.GPU -> com.google.ai.edge.litertlm.Backend.GPU()
|
|
167
167
|
Backend.NPU -> {
|
|
168
168
|
Log.i(TAG, "NPU backend requested - requires hardware support")
|
|
169
|
-
com.google.ai.edge.litertlm.Backend.NPU
|
|
169
|
+
com.google.ai.edge.litertlm.Backend.NPU()
|
|
170
170
|
}
|
|
171
|
-
else -> com.google.ai.edge.litertlm.Backend.CPU
|
|
171
|
+
else -> com.google.ai.edge.litertlm.Backend.CPU()
|
|
172
172
|
}
|
|
173
173
|
|
|
174
|
-
// Vision backend: hardcoded to GPU (required by Gemma
|
|
175
|
-
val lmVisionBackend = com.google.ai.edge.litertlm.Backend.GPU
|
|
174
|
+
// Vision backend: hardcoded to GPU (required by Gemma models)
|
|
175
|
+
val lmVisionBackend = com.google.ai.edge.litertlm.Backend.GPU()
|
|
176
176
|
|
|
177
177
|
// Audio backend: hardcoded to CPU (optimal for audio processing)
|
|
178
|
-
val lmAudioBackend = com.google.ai.edge.litertlm.Backend.CPU
|
|
178
|
+
val lmAudioBackend = com.google.ai.edge.litertlm.Backend.CPU()
|
|
179
179
|
|
|
180
180
|
Log.i(TAG, "Backend config: main=$lmBackend, vision=$lmVisionBackend (hardcoded), audio=$lmAudioBackend (hardcoded)")
|
|
181
181
|
|
|
@@ -228,13 +228,13 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
|
|
|
228
228
|
Log.i(TAG, "sendMessage (Promise): $message")
|
|
229
229
|
|
|
230
230
|
// Blocking inference (safe here because we are in Promise.parallel worker thread)
|
|
231
|
-
val userMsg = LiteRTMessage.of(message)
|
|
231
|
+
val userMsg = LiteRTMessage.of(text = message)
|
|
232
232
|
val startTime = System.nanoTime()
|
|
233
|
-
val responseMsg = conversation!!.sendMessage(userMsg)
|
|
233
|
+
val responseMsg = conversation!!.sendMessage(message = userMsg)
|
|
234
234
|
val elapsedMs = (System.nanoTime() - startTime) / 1_000_000.0
|
|
235
235
|
|
|
236
236
|
// Extract text
|
|
237
|
-
val response = responseMsg.contents
|
|
237
|
+
val response = responseMsg.contents.contents
|
|
238
238
|
.filterIsInstance<com.google.ai.edge.litertlm.Content.Text>()
|
|
239
239
|
.joinToString("") { it.text }
|
|
240
240
|
|
|
@@ -242,6 +242,9 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
|
|
|
242
242
|
history.add(Message(Role.MODEL, response))
|
|
243
243
|
|
|
244
244
|
// Update stats with real timing data
|
|
245
|
+
// Token count heuristic: LiteRT-LM Android SDK does not expose
|
|
246
|
+
// actual token counts from inference. We approximate using
|
|
247
|
+
// ~4 chars/token. iOS uses the C API benchmark info for real counts.
|
|
245
248
|
val promptTokens = message.length / 4.0
|
|
246
249
|
val completionTokens = response.length / 4.0
|
|
247
250
|
lastStats = GenerationStats(
|
|
@@ -279,8 +282,8 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
|
|
|
279
282
|
)
|
|
280
283
|
|
|
281
284
|
try {
|
|
282
|
-
val userMsg = LiteRTMessage.of(message)
|
|
283
|
-
conversation!!.sendMessageAsync(userMsg, listener)
|
|
285
|
+
val userMsg = LiteRTMessage.of(text = message)
|
|
286
|
+
conversation!!.sendMessageAsync(message = userMsg, callback = listener)
|
|
284
287
|
} catch (e: Exception) {
|
|
285
288
|
Log.e(TAG, "Failed to initiate async generation", e)
|
|
286
289
|
onToken("Error: ${e.message}", true)
|
|
@@ -343,19 +346,14 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
|
|
|
343
346
|
// Use factory method Message.of passing a list of Content
|
|
344
347
|
val textContent = Content.Text(message)
|
|
345
348
|
|
|
346
|
-
val
|
|
347
|
-
textContent,
|
|
348
|
-
Content.ImageFile(processedImagePath)
|
|
349
|
-
)
|
|
350
|
-
|
|
351
|
-
val userMsg = LiteRTMessage.of(contentList)
|
|
349
|
+
val userMsg = LiteRTMessage.of(textContent, Content.ImageFile(processedImagePath))
|
|
352
350
|
|
|
353
351
|
// Add to history
|
|
354
352
|
history.add(Message(Role.USER, "$message [Image]"))
|
|
355
353
|
|
|
356
|
-
val responseMsg = conversation!!.sendMessage(userMsg)
|
|
354
|
+
val responseMsg = conversation!!.sendMessage(message = userMsg)
|
|
357
355
|
|
|
358
|
-
val response = responseMsg.contents
|
|
356
|
+
val response = responseMsg.contents.contents
|
|
359
357
|
.filterIsInstance<Content.Text>()
|
|
360
358
|
.joinToString("") { it.text }
|
|
361
359
|
|
|
@@ -490,18 +488,16 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
|
|
|
490
488
|
|
|
491
489
|
// Load audio
|
|
492
490
|
|
|
493
|
-
val
|
|
491
|
+
val userMsg = LiteRTMessage.of(
|
|
494
492
|
Content.Text(message),
|
|
495
493
|
Content.AudioFile(audioPath)
|
|
496
494
|
)
|
|
497
|
-
|
|
498
|
-
val userMsg = LiteRTMessage.of(contentList)
|
|
499
495
|
|
|
500
496
|
history.add(Message(Role.USER, "$message [Audio]"))
|
|
501
497
|
|
|
502
|
-
val responseMsg = conversation!!.sendMessage(userMsg)
|
|
498
|
+
val responseMsg = conversation!!.sendMessage(message = userMsg)
|
|
503
499
|
|
|
504
|
-
val response = responseMsg.contents
|
|
500
|
+
val response = responseMsg.contents.contents
|
|
505
501
|
.filterIsInstance<Content.Text>()
|
|
506
502
|
.joinToString("") { it.text }
|
|
507
503
|
|
|
@@ -628,8 +624,8 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
|
|
|
628
624
|
// Send system instruction as the first turn to prime the conversation.
|
|
629
625
|
// LiteRT-LM's Conversation API handles chat template formatting,
|
|
630
626
|
// including Gemma's <start_of_turn>system block.
|
|
631
|
-
val systemMsg = LiteRTMessage.of(
|
|
632
|
-
conversation!!.sendMessage(systemMsg)
|
|
627
|
+
val systemMsg = LiteRTMessage.of(Content.Text(prompt))
|
|
628
|
+
conversation!!.sendMessage(message = systemMsg)
|
|
633
629
|
Log.i(TAG, "System prompt applied (${prompt.length} chars)")
|
|
634
630
|
} catch (e: Exception) {
|
|
635
631
|
Log.w(TAG, "Failed to apply system prompt: ${e.message}")
|
package/app.plugin.js
CHANGED
|
@@ -2,10 +2,12 @@
|
|
|
2
2
|
* Expo config plugin for react-native-litert-lm.
|
|
3
3
|
*
|
|
4
4
|
* Ensures correct build settings for the LiteRT-LM native module:
|
|
5
|
-
* - Android: minSdkVersion 26,
|
|
6
|
-
* - iOS: deployment target 15.0
|
|
5
|
+
* - Android: minSdkVersion 26, Kotlin 2.3.0 (required by litertlm-android AAR)
|
|
7
6
|
*/
|
|
8
|
-
const {
|
|
7
|
+
const {
|
|
8
|
+
withGradleProperties,
|
|
9
|
+
withProjectBuildGradle,
|
|
10
|
+
} = require('@expo/config-plugins');
|
|
9
11
|
|
|
10
12
|
function withLiteRTLM(config) {
|
|
11
13
|
// Android: Ensure minSdkVersion is at least 26
|
|
@@ -27,6 +29,29 @@ function withLiteRTLM(config) {
|
|
|
27
29
|
return config;
|
|
28
30
|
});
|
|
29
31
|
|
|
32
|
+
// Android: Pin Kotlin Gradle plugin to 2.3.0
|
|
33
|
+
// The litertlm-android AAR uses Kotlin 2.3.0 metadata (version defined in
|
|
34
|
+
// package.json โ litertLm.androidMavenVersion).
|
|
35
|
+
// React Native's default Kotlin version (2.1.0) cannot read this metadata,
|
|
36
|
+
// so we must force the Kotlin Gradle plugin to 2.3.0 in the project-level
|
|
37
|
+
// build.gradle. This ensures the fix survives `expo prebuild --clean`.
|
|
38
|
+
config = withProjectBuildGradle(config, (config) => {
|
|
39
|
+
if (config.modResults.language === 'groovy') {
|
|
40
|
+
const contents = config.modResults.contents;
|
|
41
|
+
|
|
42
|
+
// Only add if not already pinned
|
|
43
|
+
if (!contents.includes("kotlin-gradle-plugin:2.3.0")) {
|
|
44
|
+
// Replace the unversioned kotlin-gradle-plugin classpath with a pinned one
|
|
45
|
+
config.modResults.contents = contents.replace(
|
|
46
|
+
"classpath('org.jetbrains.kotlin:kotlin-gradle-plugin')",
|
|
47
|
+
"classpath('org.jetbrains.kotlin:kotlin-gradle-plugin:2.3.0')"
|
|
48
|
+
);
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
return config;
|
|
53
|
+
});
|
|
54
|
+
|
|
30
55
|
return config;
|
|
31
56
|
}
|
|
32
57
|
|
package/cpp/HybridLiteRTLM.cpp
CHANGED
|
@@ -23,13 +23,52 @@
|
|
|
23
23
|
|
|
24
24
|
#ifdef __APPLE__
|
|
25
25
|
#include "IOSDownloadHelper.h"
|
|
26
|
+
#include <os/proc.h>
|
|
26
27
|
#endif
|
|
27
28
|
#include <fstream>
|
|
28
29
|
#include <thread>
|
|
29
30
|
#include <regex>
|
|
31
|
+
#include <pthread.h>
|
|
32
|
+
#include <functional>
|
|
30
33
|
|
|
31
34
|
namespace margelo::nitro::litertlm {
|
|
32
35
|
|
|
36
|
+
// =============================================================================
|
|
37
|
+
// Thread Helper — LiteRT engine operations need >512KB stack (XNNPack, Metal)
|
|
38
|
+
// =============================================================================
|
|
39
|
+
|
|
40
|
+
static void runOnLargeStack(std::function<void()> work, size_t stackSize = 8 * 1024 * 1024) {
|
|
41
|
+
struct Context {
|
|
42
|
+
std::function<void()> fn;
|
|
43
|
+
std::exception_ptr exception;
|
|
44
|
+
};
|
|
45
|
+
Context ctx{std::move(work), nullptr};
|
|
46
|
+
|
|
47
|
+
pthread_t thread;
|
|
48
|
+
pthread_attr_t attr;
|
|
49
|
+
pthread_attr_init(&attr);
|
|
50
|
+
pthread_attr_setstacksize(&attr, stackSize);
|
|
51
|
+
|
|
52
|
+
int rc = pthread_create(&thread, &attr, [](void* arg) -> void* {
|
|
53
|
+
auto* c = static_cast<Context*>(arg);
|
|
54
|
+
try {
|
|
55
|
+
c->fn();
|
|
56
|
+
} catch (...) {
|
|
57
|
+
c->exception = std::current_exception();
|
|
58
|
+
}
|
|
59
|
+
return nullptr;
|
|
60
|
+
}, &ctx);
|
|
61
|
+
pthread_attr_destroy(&attr);
|
|
62
|
+
if (rc != 0) {
|
|
63
|
+
throw std::runtime_error("Failed to create large-stack thread (errno: " + std::to_string(rc) + ")");
|
|
64
|
+
}
|
|
65
|
+
pthread_join(thread, nullptr);
|
|
66
|
+
|
|
67
|
+
if (ctx.exception) {
|
|
68
|
+
std::rethrow_exception(ctx.exception);
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
|
|
33
72
|
// =============================================================================
|
|
34
73
|
// JSON Helpers
|
|
35
74
|
// =============================================================================
|
|
@@ -70,6 +109,34 @@ std::string HybridLiteRTLM::buildAudioMessageJson(const std::string& text, const
|
|
|
70
109
|
"]}";
|
|
71
110
|
}
|
|
72
111
|
|
|
112
|
+
/**
|
|
113
|
+
* Strip Gemma / LiteRT-LM control tokens from model output.
|
|
114
|
+
* The iOS C API returns raw model text including stop/turn markers
|
|
115
|
+
* that the Android Kotlin SDK strips automatically.
|
|
116
|
+
*/
|
|
117
|
+
static std::string stripControlTokens(const std::string& text) {
|
|
118
|
+
static const char* tokens[] = {
|
|
119
|
+
"<end_of_turn>",
|
|
120
|
+
"<start_of_turn>model",
|
|
121
|
+
"<start_of_turn>user",
|
|
122
|
+
"<start_of_turn>",
|
|
123
|
+
"<eos>",
|
|
124
|
+
};
|
|
125
|
+
std::string result = text;
|
|
126
|
+
for (auto* tok : tokens) {
|
|
127
|
+
std::string t(tok);
|
|
128
|
+
size_t pos;
|
|
129
|
+
while ((pos = result.find(t)) != std::string::npos) {
|
|
130
|
+
result.erase(pos, t.length());
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
// Trim leading/trailing whitespace
|
|
134
|
+
size_t start = result.find_first_not_of(" \t\n\r");
|
|
135
|
+
if (start == std::string::npos) return "";
|
|
136
|
+
size_t end = result.find_last_not_of(" \t\n\r");
|
|
137
|
+
return result.substr(start, end - start + 1);
|
|
138
|
+
}
|
|
139
|
+
|
|
73
140
|
std::string HybridLiteRTLM::extractTextFromResponse(const std::string& jsonResponse) {
|
|
74
141
|
// The C API response JSON is structured as:
|
|
75
142
|
// {"role":"model","content":[{"type":"text","text":"..."}]}
|
|
@@ -102,7 +169,7 @@ std::string HybridLiteRTLM::extractTextFromResponse(const std::string& jsonRespo
|
|
|
102
169
|
result += jsonResponse[i];
|
|
103
170
|
}
|
|
104
171
|
}
|
|
105
|
-
return result;
|
|
172
|
+
return stripControlTokens(result);
|
|
106
173
|
}
|
|
107
174
|
}
|
|
108
175
|
|
|
@@ -125,11 +192,11 @@ std::string HybridLiteRTLM::extractTextFromResponse(const std::string& jsonRespo
|
|
|
125
192
|
result += jsonResponse[i];
|
|
126
193
|
}
|
|
127
194
|
}
|
|
128
|
-
return result;
|
|
195
|
+
return stripControlTokens(result);
|
|
129
196
|
}
|
|
130
197
|
|
|
131
|
-
// Fallback: return full response
|
|
132
|
-
return jsonResponse;
|
|
198
|
+
// Fallback: return full response (still strip control tokens)
|
|
199
|
+
return stripControlTokens(jsonResponse);
|
|
133
200
|
}
|
|
134
201
|
|
|
135
202
|
// =============================================================================
|
|
@@ -191,7 +258,9 @@ std::shared_ptr<Promise<void>> HybridLiteRTLM::loadModel(
|
|
|
191
258
|
const std::string& modelPath,
|
|
192
259
|
const std::optional<LLMConfig>& config) {
|
|
193
260
|
return Promise<void>::async([this, modelPath, config]() {
|
|
194
|
-
|
|
261
|
+
runOnLargeStack([&]() {
|
|
262
|
+
loadModelInternal(modelPath, config);
|
|
263
|
+
});
|
|
195
264
|
});
|
|
196
265
|
}
|
|
197
266
|
|
|
@@ -243,7 +312,7 @@ void HybridLiteRTLM::loadModelInternal(
|
|
|
243
312
|
modelPath.c_str(),
|
|
244
313
|
backend,
|
|
245
314
|
visionBackend,
|
|
246
|
-
|
|
315
|
+
"cpu" // audio executor: iOS XCFramework lacks compiled audio ops (INTERNAL ERROR at Invoke)
|
|
247
316
|
);
|
|
248
317
|
if (!settings) {
|
|
249
318
|
return false;
|
|
@@ -336,7 +405,11 @@ void HybridLiteRTLM::loadModelInternal(
|
|
|
336
405
|
|
|
337
406
|
std::shared_ptr<Promise<std::string>> HybridLiteRTLM::sendMessage(const std::string& message) {
|
|
338
407
|
return Promise<std::string>::async([this, message]() -> std::string {
|
|
339
|
-
|
|
408
|
+
std::string result;
|
|
409
|
+
runOnLargeStack([&]() {
|
|
410
|
+
result = sendMessageInternal(message);
|
|
411
|
+
});
|
|
412
|
+
return result;
|
|
340
413
|
});
|
|
341
414
|
}
|
|
342
415
|
|
|
@@ -431,9 +504,13 @@ void HybridLiteRTLM::streamCallbackFn(void* callback_data, const char* chunk,
|
|
|
431
504
|
|
|
432
505
|
if (chunk) {
|
|
433
506
|
std::string token(chunk);
|
|
434
|
-
|
|
507
|
+
// Filter out Gemma control tokens from streamed chunks
|
|
508
|
+
std::string cleaned = stripControlTokens(token);
|
|
509
|
+
ctx->fullResponse += cleaned;
|
|
435
510
|
ctx->tokenCount++;
|
|
436
|
-
|
|
511
|
+
if (!cleaned.empty()) {
|
|
512
|
+
ctx->onToken(cleaned, false);
|
|
513
|
+
}
|
|
437
514
|
}
|
|
438
515
|
}
|
|
439
516
|
|
|
@@ -445,34 +522,42 @@ void HybridLiteRTLM::sendMessageAsync(
|
|
|
445
522
|
auto onTokenCopy = onToken;
|
|
446
523
|
auto messageCopy = message;
|
|
447
524
|
|
|
448
|
-
// Capture shared state safely
|
|
449
|
-
auto
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
525
|
+
// Capture shared state safely โ use unique_ptr to prevent leaks
|
|
526
|
+
auto ctxOwner = std::make_unique<StreamContext>();
|
|
527
|
+
ctxOwner->onToken = std::move(onTokenCopy);
|
|
528
|
+
ctxOwner->fullResponse = "";
|
|
529
|
+
ctxOwner->history = &history_;
|
|
530
|
+
ctxOwner->historyMutex = &mutex_;
|
|
531
|
+
ctxOwner->userMessage = messageCopy;
|
|
532
|
+
ctxOwner->lastStats = &lastStats_;
|
|
533
|
+
ctxOwner->startTime = std::chrono::steady_clock::now();
|
|
534
|
+
ctxOwner->tokenCount = 0;
|
|
458
535
|
|
|
459
536
|
#ifdef __APPLE__
|
|
460
537
|
ensureLoaded();
|
|
461
538
|
|
|
462
539
|
std::string msgJson = buildTextMessageJson(messageCopy);
|
|
463
540
|
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
541
|
+
// Release ownership โ the C callback now owns the context via raw pointer.
|
|
542
|
+
// streamCallbackFn will delete it when done or on error.
|
|
543
|
+
StreamContext* ctx = ctxOwner.release();
|
|
467
544
|
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
545
|
+
// Wrap the initial engine call in runOnLargeStack for consistency
|
|
546
|
+
// with all other engine entry points (XNNPack needs >512KB stack).
|
|
547
|
+
runOnLargeStack([&]() {
|
|
548
|
+
int result = litert_lm_conversation_send_message_stream(
|
|
549
|
+
conversation_, msgJson.c_str(), nullptr,
|
|
550
|
+
streamCallbackFn, ctx);
|
|
551
|
+
|
|
552
|
+
if (result != 0) {
|
|
553
|
+
delete ctx;
|
|
554
|
+
throw std::runtime_error("LiteRT-LM: Failed to start streaming inference");
|
|
555
|
+
}
|
|
556
|
+
});
|
|
472
557
|
#else
|
|
473
558
|
// Non-Apple stub
|
|
474
|
-
|
|
475
|
-
|
|
559
|
+
ctxOwner->onToken("[iOS only] Streaming not available on this platform.", true);
|
|
560
|
+
// ctxOwner auto-deleted by unique_ptr
|
|
476
561
|
#endif
|
|
477
562
|
}
|
|
478
563
|
|
|
@@ -484,7 +569,11 @@ std::shared_ptr<Promise<std::string>> HybridLiteRTLM::sendMessageWithImage(
|
|
|
484
569
|
const std::string& message,
|
|
485
570
|
const std::string& imagePath) {
|
|
486
571
|
return Promise<std::string>::async([this, message, imagePath]() -> std::string {
|
|
487
|
-
|
|
572
|
+
std::string result;
|
|
573
|
+
runOnLargeStack([&]() {
|
|
574
|
+
result = sendMessageWithImageInternal(message, imagePath);
|
|
575
|
+
});
|
|
576
|
+
return result;
|
|
488
577
|
});
|
|
489
578
|
}
|
|
490
579
|
|
|
@@ -547,7 +636,11 @@ std::shared_ptr<Promise<std::string>> HybridLiteRTLM::sendMessageWithAudio(
|
|
|
547
636
|
const std::string& message,
|
|
548
637
|
const std::string& audioPath) {
|
|
549
638
|
return Promise<std::string>::async([this, message, audioPath]() -> std::string {
|
|
550
|
-
|
|
639
|
+
std::string result;
|
|
640
|
+
runOnLargeStack([&]() {
|
|
641
|
+
result = sendMessageWithAudioInternal(message, audioPath);
|
|
642
|
+
});
|
|
643
|
+
return result;
|
|
551
644
|
});
|
|
552
645
|
}
|
|
553
646
|
|
|
@@ -574,7 +667,12 @@ std::string HybridLiteRTLM::sendMessageWithAudioInternal(
|
|
|
574
667
|
conversation_, msgJson.c_str(), nullptr);
|
|
575
668
|
|
|
576
669
|
if (!response) {
|
|
577
|
-
|
|
670
|
+
std::string errMsg = "LiteRT-LM: sendMessageWithAudio failed";
|
|
671
|
+
const char* nativeErr = litert_lm_get_last_error();
|
|
672
|
+
if (nativeErr && nativeErr[0] != '\0') {
|
|
673
|
+
errMsg += ": " + std::string(nativeErr);
|
|
674
|
+
}
|
|
675
|
+
throw std::runtime_error(errMsg);
|
|
578
676
|
}
|
|
579
677
|
|
|
580
678
|
const char* responseStr = litert_lm_json_response_get_string(response);
|
|
@@ -607,16 +705,8 @@ std::shared_ptr<Promise<std::string>> HybridLiteRTLM::downloadModel(
|
|
|
607
705
|
#ifdef __APPLE__
|
|
608
706
|
return litert_lm::downloadModelFile(url, fileName, onProgress);
|
|
609
707
|
#else
|
|
610
|
-
|
|
611
|
-
std::
|
|
612
|
-
int result = system(curlCmd.c_str());
|
|
613
|
-
if (result != 0) {
|
|
614
|
-
throw std::runtime_error("Failed to download model from: " + url);
|
|
615
|
-
}
|
|
616
|
-
if (onProgress.has_value()) {
|
|
617
|
-
onProgress.value()(1.0);
|
|
618
|
-
}
|
|
619
|
-
return destPath;
|
|
708
|
+
// Non-Apple platforms: not supported from C++ (Android uses Kotlin)
|
|
709
|
+
throw std::runtime_error("Download not available on this platform. Use the Kotlin implementation.");
|
|
620
710
|
#endif
|
|
621
711
|
});
|
|
622
712
|
}
|
|
@@ -688,8 +778,8 @@ GenerationStats HybridLiteRTLM::getStats() {
|
|
|
688
778
|
// =============================================================================
|
|
689
779
|
|
|
690
780
|
MemoryUsage HybridLiteRTLM::getMemoryUsage() {
|
|
691
|
-
double
|
|
692
|
-
double
|
|
781
|
+
double nativeHeapBytes = 0;
|
|
782
|
+
double residentBytes = 0;
|
|
693
783
|
double availableBytes = 0;
|
|
694
784
|
bool isLowMemory = false;
|
|
695
785
|
|
|
@@ -704,33 +794,26 @@ MemoryUsage HybridLiteRTLM::getMemoryUsage() {
|
|
|
704
794
|
&count);
|
|
705
795
|
|
|
706
796
|
if (kr == KERN_SUCCESS) {
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
mach_port_t host_port = mach_host_self();
|
|
712
|
-
struct host_basic_info hostInfo;
|
|
713
|
-
mach_msg_type_number_t hostCount = HOST_BASIC_INFO_COUNT;
|
|
714
|
-
|
|
715
|
-
kr = host_info(host_port, HOST_BASIC_INFO,
|
|
716
|
-
(host_info_t)&hostInfo, &hostCount);
|
|
717
|
-
|
|
718
|
-
if (kr == KERN_SUCCESS) {
|
|
719
|
-
totalMemoryBytes = static_cast<double>(hostInfo.max_mem);
|
|
797
|
+
residentBytes = static_cast<double>(info.resident_size);
|
|
798
|
+
// On iOS, mach_task_basic_info doesn't separate heap from RSS.
|
|
799
|
+
// Use resident_size (current RSS) as a proxy for native heap allocation.
|
|
800
|
+
nativeHeapBytes = static_cast<double>(info.resident_size);
|
|
720
801
|
}
|
|
721
802
|
|
|
722
|
-
|
|
723
|
-
|
|
803
|
+
// Use os_proc_available_memory() (iOS 13+) for accurate Jetsam headroom.
|
|
804
|
+
// This reports how much memory the process can still allocate before
|
|
805
|
+
// the system kills it — far more accurate than total_physical - process_rss.
|
|
806
|
+
availableBytes = static_cast<double>(os_proc_available_memory());
|
|
724
807
|
|
|
725
808
|
// Low memory threshold (~200MB available)
|
|
726
|
-
isLowMemory =
|
|
809
|
+
isLowMemory = availableBytes < 200.0 * 1024.0 * 1024.0;
|
|
727
810
|
#endif
|
|
728
811
|
|
|
729
812
|
return MemoryUsage{
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
availableBytes,
|
|
733
|
-
isLowMemory
|
|
813
|
+
nativeHeapBytes, // nativeHeapBytes (RSS as proxy on iOS)
|
|
814
|
+
residentBytes, // residentBytes
|
|
815
|
+
availableBytes, // availableMemoryBytes
|
|
816
|
+
isLowMemory // isLowMemory
|
|
734
817
|
};
|
|
735
818
|
}
|
|
736
819
|
|
package/cpp/HybridLiteRTLM.hpp
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
// react-native-litert-lm
|
|
4
4
|
//
|
|
5
5
|
// High-performance LLM inference using LiteRT-LM.
|
|
6
|
-
// Supports Gemma 3n and other .litertlm models.
|
|
6
|
+
// Supports Gemma 4, Gemma 3n, and other .litertlm models.
|
|
7
7
|
//
|
|
8
8
|
// NOTE: This C++ implementation is used for iOS ONLY.
|
|
9
9
|
// Android uses the Kotlin implementation in `android/src/main/java/com/margelo/nitro/dev/litert/litertlm/HybridLiteRTLM.kt`.
|
|
@@ -112,7 +112,7 @@ private:
|
|
|
112
112
|
mutable std::mutex mutex_;
|
|
113
113
|
|
|
114
114
|
// Configuration - backend
|
|
115
|
-
Backend backend_ = Backend::
|
|
115
|
+
Backend backend_ = Backend::CPU;
|
|
116
116
|
|
|
117
117
|
// System prompt / instruction
|
|
118
118
|
std::string systemPrompt_;
|
package/lib/hooks.js
CHANGED
|
@@ -51,6 +51,10 @@ function useModel(pathOrUrl, config) {
|
|
|
51
51
|
enableMemoryTracking,
|
|
52
52
|
maxMemorySnapshots,
|
|
53
53
|
});
|
|
54
|
+
// Reset ready state — the new instance has no model loaded yet.
|
|
55
|
+
// This prevents stale isReady=true after Fast Refresh (which
|
|
56
|
+
// preserves useState but re-runs useEffect).
|
|
57
|
+
setIsReady(false);
|
|
54
58
|
// Cleanup on unmount
|
|
55
59
|
return () => {
|
|
56
60
|
try {
|
package/lib/index.d.ts
CHANGED
|
@@ -45,6 +45,10 @@ export { createLLM } from "./modelFactory";
|
|
|
45
45
|
* Use with model download utilities or as reference.
|
|
46
46
|
*/
|
|
47
47
|
export declare const Models: {
|
|
48
|
+
/** Gemma 4 E2B Instruct (2B parameters, latest generation) */
|
|
49
|
+
readonly GEMMA_4_E2B: "gemma-4-E2B-it-litert-lm";
|
|
50
|
+
/** Gemma 4 E4B Instruct (4B parameters, higher quality) */
|
|
51
|
+
readonly GEMMA_4_E4B: "gemma-4-E4B-it-litert-lm";
|
|
48
52
|
/** Gemma 3n E2B (2B parameters, efficient) */
|
|
49
53
|
readonly GEMMA_3N_E2B: "gemma-3n-E2B-it-litert-lm-preview";
|
|
50
54
|
/** Gemma 3n E4B (4B parameters, higher quality) */
|
|
@@ -59,9 +63,10 @@ export declare const Models: {
|
|
|
59
63
|
export type ModelId = (typeof Models)[keyof typeof Models];
|
|
60
64
|
/**
|
|
61
65
|
* Get the recommended backend for the current platform.
|
|
62
|
-
* Returns '
|
|
66
|
+
* Returns 'cpu' as the safe default. GPU (Metal on iOS, GPU delegate on Android)
|
|
67
|
+
* is faster but may not be available on all devices or model configurations.
|
|
63
68
|
*
|
|
64
|
-
* @returns The recommended backend ('
|
|
69
|
+
* @returns The recommended backend ('cpu')
|
|
65
70
|
*
|
|
66
71
|
* @example
|
|
67
72
|
* ```typescript
|
|
@@ -106,5 +111,17 @@ export declare function checkBackendSupport(backend: Backend): string | undefine
|
|
|
106
111
|
export declare function checkMultimodalSupport(): string | undefined;
|
|
107
112
|
/**
|
|
108
113
|
* Download URL for the Gemma 3n E2B IT INT4 model.
|
|
114
|
+
* Note: Requires a HuggingFace account (gated model).
|
|
109
115
|
*/
|
|
110
116
|
export declare const GEMMA_3N_E2B_IT_INT4 = "https://litert.dev/gemma-3n-E2B-it-int4.litertlm";
|
|
117
|
+
/**
|
|
118
|
+
* Download URL for the Gemma 4 E2B IT model (2.58 GB).
|
|
119
|
+
* Public — no HuggingFace account required.
|
|
120
|
+
*/
|
|
121
|
+
export declare const GEMMA_4_E2B_IT = "https://huggingface.co/litert-community/gemma-4-E2B-it-litert-lm/resolve/main/gemma-4-E2B-it.litertlm";
|
|
122
|
+
/**
|
|
123
|
+
* Download URL for the Gemma 4 E4B IT model (3.65 GB).
|
|
124
|
+
* Higher quality than E2B but requires more device memory.
|
|
125
|
+
* Public — no HuggingFace account required.
|
|
126
|
+
*/
|
|
127
|
+
export declare const GEMMA_4_E4B_IT = "https://huggingface.co/litert-community/gemma-4-E4B-it-litert-lm/resolve/main/gemma-4-E4B-it.litertlm";
|
package/lib/index.js
CHANGED
|
@@ -14,7 +14,7 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
|
14
14
|
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
15
15
|
};
|
|
16
16
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
-
exports.GEMMA_3N_E2B_IT_INT4 = exports.Models = exports.createLLM = exports.createNativeBuffer = exports.createMemoryTracker = exports.applyLlamaTemplate = exports.applyPhiTemplate = exports.applyGemmaTemplate = void 0;
|
|
17
|
+
exports.GEMMA_4_E4B_IT = exports.GEMMA_4_E2B_IT = exports.GEMMA_3N_E2B_IT_INT4 = exports.Models = exports.createLLM = exports.createNativeBuffer = exports.createMemoryTracker = exports.applyLlamaTemplate = exports.applyPhiTemplate = exports.applyGemmaTemplate = void 0;
|
|
18
18
|
exports.getRecommendedBackend = getRecommendedBackend;
|
|
19
19
|
exports.checkBackendSupport = checkBackendSupport;
|
|
20
20
|
exports.checkMultimodalSupport = checkMultimodalSupport;
|
|
@@ -67,6 +67,10 @@ Object.defineProperty(exports, "createLLM", { enumerable: true, get: function ()
|
|
|
67
67
|
* Use with model download utilities or as reference.
|
|
68
68
|
*/
|
|
69
69
|
exports.Models = {
|
|
70
|
+
/** Gemma 4 E2B Instruct (2B parameters, latest generation) */
|
|
71
|
+
GEMMA_4_E2B: "gemma-4-E2B-it-litert-lm",
|
|
72
|
+
/** Gemma 4 E4B Instruct (4B parameters, higher quality) */
|
|
73
|
+
GEMMA_4_E4B: "gemma-4-E4B-it-litert-lm",
|
|
70
74
|
/** Gemma 3n E2B (2B parameters, efficient) */
|
|
71
75
|
GEMMA_3N_E2B: "gemma-3n-E2B-it-litert-lm-preview",
|
|
72
76
|
/** Gemma 3n E4B (4B parameters, higher quality) */
|
|
@@ -80,9 +84,10 @@ exports.Models = {
|
|
|
80
84
|
};
|
|
81
85
|
/**
|
|
82
86
|
* Get the recommended backend for the current platform.
|
|
83
|
-
* Returns '
|
|
87
|
+
* Returns 'cpu' as the safe default. GPU (Metal on iOS, GPU delegate on Android)
|
|
88
|
+
* is faster but may not be available on all devices or model configurations.
|
|
84
89
|
*
|
|
85
|
-
* @returns The recommended backend ('
|
|
90
|
+
* @returns The recommended backend ('cpu')
|
|
86
91
|
*
|
|
87
92
|
* @example
|
|
88
93
|
* ```typescript
|
|
@@ -91,9 +96,9 @@ exports.Models = {
|
|
|
91
96
|
* ```
|
|
92
97
|
*/
|
|
93
98
|
function getRecommendedBackend() {
|
|
94
|
-
//
|
|
95
|
-
//
|
|
96
|
-
return "
|
|
99
|
+
// CPU is the safe default — always available, broadly compatible.
|
|
100
|
+
// GPU is faster but may fail on some models/devices.
|
|
101
|
+
return "cpu";
|
|
97
102
|
}
|
|
98
103
|
/**
|
|
99
104
|
* Check if a backend configuration is supported on the current platform.
|
|
@@ -140,11 +145,23 @@ function checkBackendSupport(backend) {
|
|
|
140
145
|
*/
|
|
141
146
|
function checkMultimodalSupport() {
|
|
142
147
|
if (react_native_1.Platform.OS === "ios") {
|
|
143
|
-
return "Multimodal (image/audio) is
|
|
148
|
+
return "Multimodal (image/audio) is not available on iOS. The XCFramework lacks compiled vision and audio executor ops.";
|
|
144
149
|
}
|
|
145
150
|
return undefined;
|
|
146
151
|
}
|
|
147
152
|
/**
|
|
148
153
|
* Download URL for the Gemma 3n E2B IT INT4 model.
|
|
154
|
+
* Note: Requires a HuggingFace account (gated model).
|
|
149
155
|
*/
|
|
150
156
|
exports.GEMMA_3N_E2B_IT_INT4 = "https://litert.dev/gemma-3n-E2B-it-int4.litertlm";
|
|
157
|
+
/**
|
|
158
|
+
* Download URL for the Gemma 4 E2B IT model (2.58 GB).
|
|
159
|
+
* Public — no HuggingFace account required.
|
|
160
|
+
*/
|
|
161
|
+
exports.GEMMA_4_E2B_IT = "https://huggingface.co/litert-community/gemma-4-E2B-it-litert-lm/resolve/main/gemma-4-E2B-it.litertlm";
|
|
162
|
+
/**
|
|
163
|
+
* Download URL for the Gemma 4 E4B IT model (3.65 GB).
|
|
164
|
+
* Higher quality than E2B but requires more device memory.
|
|
165
|
+
* Public — no HuggingFace account required.
|
|
166
|
+
*/
|
|
167
|
+
exports.GEMMA_4_E4B_IT = "https://huggingface.co/litert-community/gemma-4-E4B-it-litert-lm/resolve/main/gemma-4-E4B-it.litertlm";
|
|
@@ -26,18 +26,18 @@ export interface LLMConfig {
|
|
|
26
26
|
systemPrompt?: string;
|
|
27
27
|
/**
|
|
28
28
|
* Primary compute backend for text generation.
|
|
29
|
-
* - 'cpu': CPU inference (
|
|
30
|
-
* - 'gpu': GPU acceleration (fast,
|
|
29
|
+
* - 'cpu': CPU inference (safe default, always available)
|
|
30
|
+
* - 'gpu': GPU acceleration (fast, Metal on iOS, GPU delegate on Android)
|
|
31
31
|
* - 'npu': NPU/Neural Engine (fastest on supported devices)
|
|
32
32
|
*
|
|
33
|
-
* If not specified, defaults to '
|
|
33
|
+
* If not specified, defaults to 'cpu'.
|
|
34
34
|
* If specified backend is unavailable, falls back automatically.
|
|
35
35
|
*
|
|
36
36
|
* @remarks
|
|
37
|
-
* Vision encoder is always set to GPU (required by Gemma
|
|
37
|
+
* Vision encoder is always set to GPU (required by Gemma models).
|
|
38
38
|
* Audio encoder is always set to CPU (optimal for audio processing).
|
|
39
39
|
*
|
|
40
|
-
* @default '
|
|
40
|
+
* @default 'cpu'
|
|
41
41
|
*/
|
|
42
42
|
backend?: Backend;
|
|
43
43
|
/**
|
|
@@ -104,12 +104,12 @@ export interface MemoryUsage {
|
|
|
104
104
|
}
|
|
105
105
|
/**
|
|
106
106
|
* LiteRT-LM: High-performance LLM inference engine.
|
|
107
|
-
* Supports Gemma 3n, Phi-4, Qwen, and other .litertlm models.
|
|
107
|
+
* Supports Gemma 4, Gemma 3n, Phi-4, Qwen, and other .litertlm models.
|
|
108
108
|
*
|
|
109
109
|
* @example
|
|
110
110
|
* ```typescript
|
|
111
111
|
* const llm = createLLM();
|
|
112
|
-
* llm.loadModel('/path/to/gemma-
|
|
112
|
+
* llm.loadModel('/path/to/gemma-4-E2B-it.litertlm', { backend: 'cpu' });
|
|
113
113
|
*
|
|
114
114
|
* // Blocking generation
|
|
115
115
|
* const response = llm.sendMessage('What is the capital of France?');
|
package/package.json
CHANGED
|
@@ -1,7 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "react-native-litert-lm",
|
|
3
|
-
"version": "0.3.
|
|
4
|
-
"
|
|
3
|
+
"version": "0.3.3",
|
|
4
|
+
"litertLm": {
|
|
5
|
+
"version": "0.10.1",
|
|
6
|
+
"androidMavenVersion": "0.10.0",
|
|
7
|
+
"iosGitTag": "v0.10.1"
|
|
8
|
+
},
|
|
9
|
+
"description": "High-performance LLM inference for React Native using LiteRT-LM. Optimized for Gemma 4 and other on-device language models.",
|
|
5
10
|
"license": "MIT",
|
|
6
11
|
"author": "Hugh Chen (https://github.com/hung-yueh)",
|
|
7
12
|
"repository": {
|
|
@@ -19,6 +24,7 @@
|
|
|
19
24
|
"litert-lm",
|
|
20
25
|
"llm",
|
|
21
26
|
"gemma",
|
|
27
|
+
"gemma-4",
|
|
22
28
|
"gemma-3n",
|
|
23
29
|
"ai",
|
|
24
30
|
"machine-learning",
|
|
@@ -69,26 +75,26 @@
|
|
|
69
75
|
"release": "release-it"
|
|
70
76
|
},
|
|
71
77
|
"devDependencies": {
|
|
72
|
-
"@expo/config-plugins": "~
|
|
73
|
-
"@types/react": "~19.
|
|
74
|
-
"expo": "^54.0.31",
|
|
75
|
-
"nitrogen": "^0.35.0",
|
|
76
|
-
"react": "19.1.0",
|
|
77
|
-
"react-native": "0.81.5",
|
|
78
|
+
"@expo/config-plugins": "~55.0.0",
|
|
79
|
+
"@types/react": "~19.2.10",
|
|
78
80
|
"release-it": "^19.2.4",
|
|
79
81
|
"typescript": "^5.0.0"
|
|
80
82
|
},
|
|
81
83
|
"peerDependencies": {
|
|
82
|
-
"expo": ">=
|
|
84
|
+
"expo": ">=55.0.0",
|
|
83
85
|
"react": "*",
|
|
84
|
-
"react-native": "*"
|
|
86
|
+
"react-native": "*",
|
|
87
|
+
"react-native-nitro-modules": "^0.35.0"
|
|
85
88
|
},
|
|
86
89
|
"peerDependenciesMeta": {
|
|
87
90
|
"expo": {
|
|
88
91
|
"optional": true
|
|
92
|
+
},
|
|
93
|
+
"react": {
|
|
94
|
+
"optional": true
|
|
95
|
+
},
|
|
96
|
+
"react-native": {
|
|
97
|
+
"optional": true
|
|
89
98
|
}
|
|
90
|
-
},
|
|
91
|
-
"dependencies": {
|
|
92
|
-
"react-native-nitro-modules": "^0.35.0"
|
|
93
99
|
}
|
|
94
100
|
}
|
|
@@ -16,12 +16,12 @@
|
|
|
16
16
|
|
|
17
17
|
set -euo pipefail
|
|
18
18
|
|
|
19
|
-
LITERT_LM_VERSION="v0.9.0"
|
|
20
19
|
LITERT_LM_REPO="https://github.com/google-ai-edge/LiteRT-LM.git"
|
|
21
20
|
FRAMEWORK_NAME="LiteRTLM"
|
|
22
21
|
|
|
23
22
|
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
|
24
23
|
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
24
|
+
LITERT_LM_VERSION="$(node -e "console.log(require('$PROJECT_ROOT/package.json').litertLm.iosGitTag)")"
|
|
25
25
|
OUTPUT_DIR="$PROJECT_ROOT/ios/Frameworks"
|
|
26
26
|
C_API_HEADER_DIR="$PROJECT_ROOT/cpp/include"
|
|
27
27
|
BUILD_DIR="$PROJECT_ROOT/.litert-lm-build"
|
|
@@ -19,7 +19,7 @@ PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
|
19
19
|
OUTPUT_DIR="$PROJECT_ROOT/ios/Frameworks"
|
|
20
20
|
C_API_HEADER_DIR="$PROJECT_ROOT/cpp/include"
|
|
21
21
|
|
|
22
|
-
LITERT_LM_VERSION="
|
|
22
|
+
LITERT_LM_VERSION="$(node -e "console.log(require('$PROJECT_ROOT/package.json').litertLm.iosGitTag)")"
|
|
23
23
|
GITHUB_RAW="https://github.com/google-ai-edge/LiteRT-LM/raw/${LITERT_LM_VERSION}"
|
|
24
24
|
|
|
25
25
|
# Read version from package.json
|
package/src/hooks.ts
CHANGED
|
@@ -108,6 +108,11 @@ export function useModel(
|
|
|
108
108
|
maxMemorySnapshots,
|
|
109
109
|
});
|
|
110
110
|
|
|
111
|
+
// Reset ready state — the new instance has no model loaded yet.
|
|
112
|
+
// This prevents stale isReady=true after Fast Refresh (which
|
|
113
|
+
// preserves useState but re-runs useEffect).
|
|
114
|
+
setIsReady(false);
|
|
115
|
+
|
|
111
116
|
// Cleanup on unmount
|
|
112
117
|
return () => {
|
|
113
118
|
try {
|
package/src/index.ts
CHANGED
|
@@ -79,6 +79,10 @@ export { createLLM } from "./modelFactory";
|
|
|
79
79
|
* Use with model download utilities or as reference.
|
|
80
80
|
*/
|
|
81
81
|
export const Models = {
|
|
82
|
+
/** Gemma 4 E2B Instruct (2B parameters, latest generation) */
|
|
83
|
+
GEMMA_4_E2B: "gemma-4-E2B-it-litert-lm",
|
|
84
|
+
/** Gemma 4 E4B Instruct (4B parameters, higher quality) */
|
|
85
|
+
GEMMA_4_E4B: "gemma-4-E4B-it-litert-lm",
|
|
82
86
|
/** Gemma 3n E2B (2B parameters, efficient) */
|
|
83
87
|
GEMMA_3N_E2B: "gemma-3n-E2B-it-litert-lm-preview",
|
|
84
88
|
/** Gemma 3n E4B (4B parameters, higher quality) */
|
|
@@ -95,9 +99,10 @@ export type ModelId = (typeof Models)[keyof typeof Models];
|
|
|
95
99
|
|
|
96
100
|
/**
|
|
97
101
|
* Get the recommended backend for the current platform.
|
|
98
|
-
* Returns '
|
|
102
|
+
* Returns 'cpu' as the safe default. GPU (Metal on iOS, GPU delegate on Android)
|
|
103
|
+
* is faster but may not be available on all devices or model configurations.
|
|
99
104
|
*
|
|
100
|
-
* @returns The recommended backend ('
|
|
105
|
+
* @returns The recommended backend ('cpu')
|
|
101
106
|
*
|
|
102
107
|
* @example
|
|
103
108
|
* ```typescript
|
|
@@ -106,9 +111,9 @@ export type ModelId = (typeof Models)[keyof typeof Models];
|
|
|
106
111
|
* ```
|
|
107
112
|
*/
|
|
108
113
|
export function getRecommendedBackend(): Backend {
|
|
109
|
-
//
|
|
110
|
-
//
|
|
111
|
-
return "
|
|
114
|
+
// CPU is the safe default — always available, broadly compatible.
|
|
115
|
+
// GPU is faster but may fail on some models/devices.
|
|
116
|
+
return "cpu";
|
|
112
117
|
}
|
|
113
118
|
|
|
114
119
|
/**
|
|
@@ -158,13 +163,29 @@ export function checkBackendSupport(backend: Backend): string | undefined {
|
|
|
158
163
|
*/
|
|
159
164
|
export function checkMultimodalSupport(): string | undefined {
|
|
160
165
|
if (Platform.OS === "ios") {
|
|
161
|
-
return "Multimodal (image/audio) is
|
|
166
|
+
return "Multimodal (image/audio) is not available on iOS. The XCFramework lacks compiled vision and audio executor ops.";
|
|
162
167
|
}
|
|
163
168
|
return undefined;
|
|
164
169
|
}
|
|
165
170
|
|
|
166
171
|
/**
|
|
167
172
|
* Download URL for the Gemma 3n E2B IT INT4 model.
|
|
173
|
+
* Note: Requires a HuggingFace account (gated model).
|
|
168
174
|
*/
|
|
169
175
|
export const GEMMA_3N_E2B_IT_INT4 =
|
|
170
176
|
"https://litert.dev/gemma-3n-E2B-it-int4.litertlm";
|
|
177
|
+
|
|
178
|
+
/**
|
|
179
|
+
* Download URL for the Gemma 4 E2B IT model (2.58 GB).
|
|
180
|
+
* Public — no HuggingFace account required.
|
|
181
|
+
*/
|
|
182
|
+
export const GEMMA_4_E2B_IT =
|
|
183
|
+
"https://huggingface.co/litert-community/gemma-4-E2B-it-litert-lm/resolve/main/gemma-4-E2B-it.litertlm";
|
|
184
|
+
|
|
185
|
+
/**
|
|
186
|
+
* Download URL for the Gemma 4 E4B IT model (3.65 GB).
|
|
187
|
+
* Higher quality than E2B but requires more device memory.
|
|
188
|
+
* Public — no HuggingFace account required.
|
|
189
|
+
*/
|
|
190
|
+
export const GEMMA_4_E4B_IT =
|
|
191
|
+
"https://huggingface.co/litert-community/gemma-4-E4B-it-litert-lm/resolve/main/gemma-4-E4B-it.litertlm";
|
|
@@ -30,18 +30,18 @@ export interface LLMConfig {
|
|
|
30
30
|
|
|
31
31
|
/**
|
|
32
32
|
* Primary compute backend for text generation.
|
|
33
|
-
* - 'cpu': CPU inference (
|
|
34
|
-
* - 'gpu': GPU acceleration (fast,
|
|
33
|
+
* - 'cpu': CPU inference (safe default, always available)
|
|
34
|
+
* - 'gpu': GPU acceleration (fast, Metal on iOS, GPU delegate on Android)
|
|
35
35
|
* - 'npu': NPU/Neural Engine (fastest on supported devices)
|
|
36
36
|
*
|
|
37
|
-
* If not specified, defaults to '
|
|
37
|
+
* If not specified, defaults to 'cpu'.
|
|
38
38
|
* If specified backend is unavailable, falls back automatically.
|
|
39
39
|
*
|
|
40
40
|
* @remarks
|
|
41
|
-
* Vision encoder is always set to GPU (required by Gemma
|
|
41
|
+
* Vision encoder is always set to GPU (required by Gemma models).
|
|
42
42
|
* Audio encoder is always set to CPU (optimal for audio processing).
|
|
43
43
|
*
|
|
44
|
-
* @default '
|
|
44
|
+
* @default 'cpu'
|
|
45
45
|
*/
|
|
46
46
|
backend?: Backend;
|
|
47
47
|
|
|
@@ -116,12 +116,12 @@ export interface MemoryUsage {
|
|
|
116
116
|
|
|
117
117
|
/**
|
|
118
118
|
* LiteRT-LM: High-performance LLM inference engine.
|
|
119
|
-
* Supports Gemma 3n, Phi-4, Qwen, and other .litertlm models.
|
|
119
|
+
* Supports Gemma 4, Gemma 3n, Phi-4, Qwen, and other .litertlm models.
|
|
120
120
|
*
|
|
121
121
|
* @example
|
|
122
122
|
* ```typescript
|
|
123
123
|
* const llm = createLLM();
|
|
124
|
-
* llm.loadModel('/path/to/gemma-
|
|
124
|
+
* llm.loadModel('/path/to/gemma-4-E2B-it.litertlm', { backend: 'cpu' });
|
|
125
125
|
*
|
|
126
126
|
* // Blocking generation
|
|
127
127
|
* const response = llm.sendMessage('What is the capital of France?');
|