react-native-litert-lm 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +43 -26
- package/android/build.gradle +6 -2
- package/android/src/main/java/com/margelo/nitro/dev/litert/litertlm/HybridLiteRTLM.kt +26 -30
- package/app.plugin.js +28 -3
- package/cpp/HybridLiteRTLM.cpp +138 -58
- package/cpp/HybridLiteRTLM.hpp +2 -2
- package/cpp/include/litert_lm_engine.h +7 -0
- package/lib/index.d.ts +19 -2
- package/lib/index.js +23 -6
- package/lib/specs/LiteRTLM.nitro.d.ts +7 -7
- package/package.json +19 -13
- package/scripts/build-ios-engine.sh +20 -1
- package/scripts/download-ios-frameworks.sh +1 -1
- package/src/index.ts +26 -5
- package/src/specs/LiteRTLM.nitro.ts +7 -7
package/README.md
CHANGED
|
@@ -1,16 +1,16 @@
|
|
|
1
1
|
# react-native-litert-lm
|
|
2
2
|
|
|
3
|
-
High-performance on-device LLM inference for React Native, powered by [LiteRT-LM](https://github.com/google-ai-edge/LiteRT-LM) and [Nitro Modules](https://github.com/mrousavy/nitro). Optimized for **Gemma
|
|
3
|
+
High-performance on-device LLM inference for React Native, powered by [LiteRT-LM](https://github.com/google-ai-edge/LiteRT-LM) and [Nitro Modules](https://github.com/mrousavy/nitro). Optimized for **Gemma 4** and other on-device language models.
|
|
4
4
|
|
|
5
5
|
## Features
|
|
6
6
|
|
|
7
7
|
- ๐ **Native Performance** โ Kotlin (Android) / C++ (iOS) via Nitro Modules JSI bindings
|
|
8
|
-
- ๐ง **Gemma
|
|
8
|
+
- ๐ง **Gemma 4 Ready** โ First-class support for Gemma 4 E2B/E4B multimodal models (text + vision + audio)
|
|
9
9
|
- โก **GPU Acceleration** โ GPU delegate (Android), Metal/MPS (iOS)
|
|
10
10
|
- ๐ **Streaming Support** โ Token-by-token generation callbacks
|
|
11
11
|
- ๐ฑ **Cross-Platform** โ Android API 26+ / iOS 15.0+
|
|
12
|
-
- ๐ผ๏ธ **Multimodal** โ Image and audio input support
|
|
13
|
-
- ๐งต **Async API** โ Non-blocking inference on
|
|
12
|
+
- ๐ผ๏ธ **Multimodal** โ Image and audio input support
|
|
13
|
+
- ๐งต **Async API** โ Non-blocking inference on dedicated large-stack threads
|
|
14
14
|
- ๐ **Real Memory Tracking** โ OS-level memory metrics (RSS, native heap, available memory) via native APIs
|
|
15
15
|
- ๐งฎ **Zero-Copy Buffers** โ Memory snapshots stored in native ArrayBuffers via Nitro Modules
|
|
16
16
|
- ๐ฅ **Automatic Model Download** โ Downloads models from URL with progress tracking and local caching
|
|
@@ -94,7 +94,7 @@ The `example/` directory contains a fully functional test app with a dark-themed
|
|
|
94
94
|
|
|
95
95
|
## Model Management
|
|
96
96
|
|
|
97
|
-
LiteRT-LM models (like Gemma
|
|
97
|
+
LiteRT-LM models (like Gemma 4) are large files (2–4 GB) and cannot be bundled into your app binary. They are downloaded at runtime.
|
|
98
98
|
|
|
99
99
|
### Automatic Downloading
|
|
100
100
|
|
|
@@ -112,16 +112,15 @@ If you prefer to manage downloads yourself (e.g., using `expo-file-system`), dow
|
|
|
112
112
|
|
|
113
113
|
```typescript
|
|
114
114
|
import * as FileSystem from "expo-file-system";
|
|
115
|
+
import { GEMMA_4_E2B_IT } from "react-native-litert-lm";
|
|
115
116
|
|
|
116
|
-
const
|
|
117
|
-
"https://huggingface.co/litert-community/gemma-3n-2b-it/resolve/main/model.litertlm";
|
|
118
|
-
const localPath = `${FileSystem.documentDirectory}gemma-3n.litertlm`;
|
|
117
|
+
const localPath = `${FileSystem.documentDirectory}gemma-4-E2B-it.litertlm`;
|
|
119
118
|
|
|
120
119
|
async function downloadModel() {
|
|
121
120
|
const info = await FileSystem.getInfoAsync(localPath);
|
|
122
121
|
if (info.exists) return localPath;
|
|
123
122
|
|
|
124
|
-
await FileSystem.downloadAsync(
|
|
123
|
+
await FileSystem.downloadAsync(GEMMA_4_E2B_IT, localPath);
|
|
125
124
|
return localPath;
|
|
126
125
|
}
|
|
127
126
|
```
|
|
@@ -133,7 +132,7 @@ async function downloadModel() {
|
|
|
133
132
|
The `useModel` hook manages the full model lifecycle: downloading, loading, inference, and cleanup.
|
|
134
133
|
|
|
135
134
|
```typescript
|
|
136
|
-
import { useModel,
|
|
135
|
+
import { useModel, GEMMA_4_E2B_IT } from "react-native-litert-lm";
|
|
137
136
|
import { Platform } from "react-native";
|
|
138
137
|
|
|
139
138
|
function App() {
|
|
@@ -145,8 +144,8 @@ function App() {
|
|
|
145
144
|
load, // Manually trigger load
|
|
146
145
|
deleteModel, // Delete cached model file
|
|
147
146
|
memorySummary, // Auto-updated memory stats (if tracking enabled)
|
|
148
|
-
} = useModel(
|
|
149
|
-
backend:
|
|
147
|
+
} = useModel(GEMMA_4_E2B_IT, {
|
|
148
|
+
backend: 'cpu',
|
|
150
149
|
autoLoad: true, // Default: true. Set false to load manually via load().
|
|
151
150
|
systemPrompt: "You are a helpful assistant.",
|
|
152
151
|
enableMemoryTracking: true,
|
|
@@ -206,7 +205,7 @@ const warning = checkMultimodalSupport();
|
|
|
206
205
|
if (warning) {
|
|
207
206
|
console.warn(warning); // Experimental on iOS
|
|
208
207
|
} else {
|
|
209
|
-
// Image input (for vision models like Gemma
|
|
208
|
+
// Image input (for vision models like Gemma 4)
|
|
210
209
|
// Images >1024px are automatically resized to prevent OOM
|
|
211
210
|
const response = await llm.sendMessageWithImage(
|
|
212
211
|
"What's in this image?",
|
|
@@ -310,15 +309,20 @@ const buffer = tracker.getNativeBuffer();
|
|
|
310
309
|
|
|
311
310
|
Download `.litertlm` models automatically using the exported URL constants, or manually from [HuggingFace](https://huggingface.co/litert-community):
|
|
312
311
|
|
|
313
|
-
| Constant | Model
|
|
314
|
-
| :--------------------- |
|
|
315
|
-
| `
|
|
312
|
+
| Constant | Model | Size | Min RAM | Auth Required |
|
|
313
|
+
| :--------------------- | :--------------------------------- | :------- | :------ | :------------ |
|
|
314
|
+
| `GEMMA_4_E2B_IT` | Gemma 4 E2B (Multimodal, IT) | 2.58 GB | 4 GB+ | ❌ No |
|
|
315
|
+
| `GEMMA_4_E4B_IT` | Gemma 4 E4B (Higher Quality) | 3.65 GB | 6 GB+ | ❌ No |
|
|
316
|
+
| `GEMMA_3N_E2B_IT_INT4` | Gemma 3n E2B (Int4, Multimodal) | ~1.3 GB | 4 GB+ | ✅ HuggingFace |
|
|
317
|
+
|
|
318
|
+
> **Recommended:** Use `GEMMA_4_E2B_IT` for most use cases. It's multimodal (text + vision + audio) and downloads directly from HuggingFace without requiring an account.
|
|
319
|
+
>
|
|
320
|
+
> **iOS Note:** Models larger than ~2 GB (like Gemma 4) require the `com.apple.developer.kernel.extended-virtual-addressing` entitlement. See [iOS Entitlements](#ios-entitlements) below.
|
|
316
321
|
|
|
317
322
|
**Other compatible models** (download manually from HuggingFace):
|
|
318
323
|
|
|
319
324
|
| Model | Size | Min RAM | Notes |
|
|
320
325
|
| ------------- | ------- | ------- | --------------------- |
|
|
321
|
-
| Gemma 3n E4B | ~4 GB | 8 GB+ | Higher quality |
|
|
322
326
|
| Gemma 3 1B | ~1 GB | 4 GB+ | Smallest, fastest |
|
|
323
327
|
| Phi-4 Mini | ~2 GB | 4 GB+ | Microsoft's small LLM |
|
|
324
328
|
| Qwen 2.5 1.5B | ~1.5 GB | 4 GB+ | Multilingual |
|
|
@@ -339,7 +343,7 @@ Loads a model from a local path or HTTPS URL.
|
|
|
339
343
|
| Parameter | Type | Default | Description |
|
|
340
344
|
| --------------------- | -------- | ------- | ----------------------------------------- |
|
|
341
345
|
| `path` | `string` | — | Absolute path to `.litertlm` or HTTPS URL |
|
|
342
|
-
| `config.backend` | `string` | `'
|
|
346
|
+
| `config.backend` | `string` | `'cpu'` | `'cpu'`, `'gpu'`, or `'npu'` |
|
|
343
347
|
| `config.systemPrompt` | `string` | — | System prompt for the model |
|
|
344
348
|
| `config.temperature` | `number` | `0.7` | Sampling temperature |
|
|
345
349
|
| `config.topK` | `number` | `40` | Top-K sampling |
|
|
@@ -354,7 +358,7 @@ Loads a model from a local path or HTTPS URL.
|
|
|
354
358
|
| `'gpu'` | GPU / Metal | Fast | Recommended default |
|
|
355
359
|
| `'npu'` | NPU / Neural Engine | Fastest | Requires supported hardware; falls back to GPU |
|
|
356
360
|
|
|
357
|
-
> **iOS**: `'gpu'`
|
|
361
|
+
> **iOS**: `'cpu'` is the recommended default backend. `'gpu'` (Metal/MPS) is also supported. The engine automatically tries multiple backend combinations if the primary one fails.
|
|
358
362
|
|
|
359
363
|
### `sendMessage(message): Promise<string>`
|
|
360
364
|
|
|
@@ -366,11 +370,11 @@ Streaming generation. Callback signature: `(token: string, isDone: boolean) => v
|
|
|
366
370
|
|
|
367
371
|
### `sendMessageWithImage(message, imagePath): Promise<string>`
|
|
368
372
|
|
|
369
|
-
Send a message with an image (
|
|
373
|
+
Send a message with an image (for vision models like Gemma 4 E2B).
|
|
370
374
|
|
|
371
375
|
### `sendMessageWithAudio(message, audioPath): Promise<string>`
|
|
372
376
|
|
|
373
|
-
Send a message with audio (
|
|
377
|
+
Send a message with audio (for audio-capable models like Gemma 4 E2B).
|
|
374
378
|
|
|
375
379
|
### `getStats(): GenerationStats`
|
|
376
380
|
|
|
@@ -448,8 +452,7 @@ const prompt = applyGemmaTemplate(
|
|
|
448
452
|
| react-native-nitro-modules | 0.35.0+ |
|
|
449
453
|
| Android API | 26+ (ARM64) |
|
|
450
454
|
| iOS | 15.0+ (ARM64) |
|
|
451
|
-
| LiteRT-LM
|
|
452
|
-
| LiteRT-LM iOS Engine | v0.9.0 |
|
|
455
|
+
| LiteRT-LM Engine | 0.10.1 |
|
|
453
456
|
|
|
454
457
|
## Platform Support
|
|
455
458
|
|
|
@@ -464,7 +467,8 @@ const prompt = applyGemmaTemplate(
|
|
|
464
467
|
| ---------------------------- | ------ | ----------------------------------------------------- |
|
|
465
468
|
| Text inference (blocking) | ✅ | Via LiteRT-LM C API |
|
|
466
469
|
| Text inference (streaming) | ✅ | Token-by-token callbacks |
|
|
467
|
-
|
|
|
470
|
+
| CPU inference | ✅ | Recommended default backend |
|
|
471
|
+
| GPU inference (Metal/MPS) | ✅ | Supported via `backend: 'gpu'` |
|
|
468
472
|
| Model download with progress | ✅ | NSURLSession, cached in `Caches/` |
|
|
469
473
|
| Memory tracking | ✅ | `mach_task_basic_info` |
|
|
470
474
|
| Multi-turn conversation | ✅ | Context retained across turns |
|
|
@@ -472,6 +476,19 @@ const prompt = applyGemmaTemplate(
|
|
|
472
476
|
| Constrained decoding | ❌ | Requires llguidance Rust runtime |
|
|
473
477
|
| Function calling | ❌ | Requires Rust CXX bridge runtime |
|
|
474
478
|
|
|
479
|
+
### iOS Entitlements
|
|
480
|
+
|
|
481
|
+
Models larger than ~2 GB (like Gemma 4 E2B at 2.58 GB) require the **Extended Virtual Addressing** entitlement on iOS physical devices. Without it, iOS limits virtual memory to ~2 GB and the app will be killed by Jetsam.
|
|
482
|
+
|
|
483
|
+
Add to your app's `.entitlements` file:
|
|
484
|
+
|
|
485
|
+
```xml
|
|
486
|
+
<key>com.apple.developer.kernel.extended-virtual-addressing</key>
|
|
487
|
+
<true/>
|
|
488
|
+
```
|
|
489
|
+
|
|
490
|
+
> **Note:** This entitlement requires a **paid Apple Developer account** ($99/year). Gemma 3n E2B (~1.3 GB) works without it.
|
|
491
|
+
|
|
475
492
|
## Building the iOS Engine
|
|
476
493
|
|
|
477
494
|
The iOS build uses a **Bazel-to-XCFramework pipeline** that compiles the LiteRT-LM C engine and all transitive dependencies into a static library (~83 MB).
|
|
@@ -489,7 +506,7 @@ The iOS build uses a **Bazel-to-XCFramework pipeline** that compiles the LiteRT-
|
|
|
489
506
|
|
|
490
507
|
This will:
|
|
491
508
|
|
|
492
|
-
1. Clone/checkout LiteRT-LM `v0.
|
|
509
|
+
1. Clone/checkout LiteRT-LM `v0.10.1` source into `.litert-lm-build/`
|
|
493
510
|
2. Build `//c:engine` for `ios_arm64` and `ios_sim_arm64` via Bazel
|
|
494
511
|
3. Collect all transitive `.o` files (engine, protobuf, re2, sentencepiece, etc.)
|
|
495
512
|
4. Compile C/C++ stubs for unavailable Rust dependencies
|
|
@@ -541,7 +558,7 @@ Additionally, `PromptTemplate` is patched at build time to use a simplified C++
|
|
|
541
558
|
```
|
|
542
559
|
|
|
543
560
|
- **Android**: Kotlin (`HybridLiteRTLM.kt`) interfacing with the `litertlm-android` AAR.
|
|
544
|
-
- **iOS**: C++ (`HybridLiteRTLM.cpp`) interfacing with the LiteRT-LM C API via a prebuilt `LiteRTLM.xcframework`. Platform-specific code (model downloading, file management) is in Objective-C++ (`ios/IOSDownloadHelper.mm`).
|
|
561
|
+
- **iOS**: C++ (`HybridLiteRTLM.cpp`) interfacing with the LiteRT-LM C API via a prebuilt `LiteRTLM.xcframework`. All engine operations (load, inference, streaming) run on dedicated `pthread` threads with 8 MB stack to accommodate XNNPack's stack requirements. Platform-specific code (model downloading, file management) is in Objective-C++ (`ios/IOSDownloadHelper.mm`).
|
|
545
562
|
|
|
546
563
|
> **For contributors**: Changes to `cpp/HybridLiteRTLM.cpp` do not affect Android. Feature changes must be applied to both the Kotlin and C++ implementations.
|
|
547
564
|
|
package/android/build.gradle
CHANGED
|
@@ -9,9 +9,13 @@ plugins {
|
|
|
9
9
|
// Apply Nitrogen autolinking
|
|
10
10
|
apply from: '../nitrogen/generated/android/LiteRTLM+autolinking.gradle'
|
|
11
11
|
|
|
12
|
+
// Read LiteRT-LM SDK version from package.json (single source of truth)
|
|
13
|
+
def packageJson = new groovy.json.JsonSlurper().parseText(file('../package.json').text)
|
|
14
|
+
def litertLmVersion = packageJson.litertLm.androidMavenVersion
|
|
15
|
+
|
|
12
16
|
android {
|
|
13
17
|
namespace "dev.litert.litertlm"
|
|
14
|
-
compileSdk
|
|
18
|
+
compileSdk 36
|
|
15
19
|
|
|
16
20
|
defaultConfig {
|
|
17
21
|
minSdk 26 // LiteRT-LM requires API 26+
|
|
@@ -84,5 +88,5 @@ dependencies {
|
|
|
84
88
|
implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-android:1.7.3'
|
|
85
89
|
|
|
86
90
|
// LiteRT-LM Kotlin API
|
|
87
|
-
implementation
|
|
91
|
+
implementation "com.google.ai.edge.litertlm:litertlm-android:${litertLmVersion}"
|
|
88
92
|
}
|
|
@@ -44,8 +44,8 @@ internal class StreamingCallbackListener(
|
|
|
44
44
|
private val history: MutableList<Message>,
|
|
45
45
|
) : com.google.ai.edge.litertlm.MessageCallback {
|
|
46
46
|
|
|
47
|
-
override fun onMessage(responseMsg: com.google.ai.edge.litertlm.
|
|
48
|
-
val chunk = responseMsg.contents
|
|
47
|
+
override fun onMessage(responseMsg: com.google.ai.edge.litertlm.Message) {
|
|
48
|
+
val chunk = responseMsg.contents.contents
|
|
49
49
|
.filterIsInstance<com.google.ai.edge.litertlm.Content.Text>()
|
|
50
50
|
.joinToString("") { it.text }
|
|
51
51
|
|
|
@@ -123,7 +123,7 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
|
|
|
123
123
|
)
|
|
124
124
|
|
|
125
125
|
// Configuration
|
|
126
|
-
private var backend: Backend = Backend.
|
|
126
|
+
private var backend: Backend = Backend.CPU
|
|
127
127
|
private var temperature: Double = 0.7
|
|
128
128
|
private var topK: Int = 40
|
|
129
129
|
private var topP: Double = 0.95
|
|
@@ -161,21 +161,21 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
|
|
|
161
161
|
}
|
|
162
162
|
|
|
163
163
|
try {
|
|
164
|
-
// Map our Backend enum to LiteRT-LM Backend
|
|
164
|
+
// Map our Backend enum to LiteRT-LM Backend sealed class
|
|
165
165
|
val lmBackend = when (backend) {
|
|
166
|
-
Backend.GPU -> com.google.ai.edge.litertlm.Backend.GPU
|
|
166
|
+
Backend.GPU -> com.google.ai.edge.litertlm.Backend.GPU()
|
|
167
167
|
Backend.NPU -> {
|
|
168
168
|
Log.i(TAG, "NPU backend requested - requires hardware support")
|
|
169
|
-
com.google.ai.edge.litertlm.Backend.NPU
|
|
169
|
+
com.google.ai.edge.litertlm.Backend.NPU()
|
|
170
170
|
}
|
|
171
|
-
else -> com.google.ai.edge.litertlm.Backend.CPU
|
|
171
|
+
else -> com.google.ai.edge.litertlm.Backend.CPU()
|
|
172
172
|
}
|
|
173
173
|
|
|
174
|
-
// Vision backend: hardcoded to GPU (required by Gemma
|
|
175
|
-
val lmVisionBackend = com.google.ai.edge.litertlm.Backend.GPU
|
|
174
|
+
// Vision backend: hardcoded to GPU (required by Gemma models)
|
|
175
|
+
val lmVisionBackend = com.google.ai.edge.litertlm.Backend.GPU()
|
|
176
176
|
|
|
177
177
|
// Audio backend: hardcoded to CPU (optimal for audio processing)
|
|
178
|
-
val lmAudioBackend = com.google.ai.edge.litertlm.Backend.CPU
|
|
178
|
+
val lmAudioBackend = com.google.ai.edge.litertlm.Backend.CPU()
|
|
179
179
|
|
|
180
180
|
Log.i(TAG, "Backend config: main=$lmBackend, vision=$lmVisionBackend (hardcoded), audio=$lmAudioBackend (hardcoded)")
|
|
181
181
|
|
|
@@ -228,13 +228,13 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
|
|
|
228
228
|
Log.i(TAG, "sendMessage (Promise): $message")
|
|
229
229
|
|
|
230
230
|
// Blocking inference (safe here because we are in Promise.parallel worker thread)
|
|
231
|
-
val userMsg = LiteRTMessage.of(message)
|
|
231
|
+
val userMsg = LiteRTMessage.of(text = message)
|
|
232
232
|
val startTime = System.nanoTime()
|
|
233
|
-
val responseMsg = conversation!!.sendMessage(userMsg)
|
|
233
|
+
val responseMsg = conversation!!.sendMessage(message = userMsg)
|
|
234
234
|
val elapsedMs = (System.nanoTime() - startTime) / 1_000_000.0
|
|
235
235
|
|
|
236
236
|
// Extract text
|
|
237
|
-
val response = responseMsg.contents
|
|
237
|
+
val response = responseMsg.contents.contents
|
|
238
238
|
.filterIsInstance<com.google.ai.edge.litertlm.Content.Text>()
|
|
239
239
|
.joinToString("") { it.text }
|
|
240
240
|
|
|
@@ -242,6 +242,9 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
|
|
|
242
242
|
history.add(Message(Role.MODEL, response))
|
|
243
243
|
|
|
244
244
|
// Update stats with real timing data
|
|
245
|
+
// Token count heuristic: LiteRT-LM Android SDK does not expose
|
|
246
|
+
// actual token counts from inference. We approximate using
|
|
247
|
+
// ~4 chars/token. iOS uses the C API benchmark info for real counts.
|
|
245
248
|
val promptTokens = message.length / 4.0
|
|
246
249
|
val completionTokens = response.length / 4.0
|
|
247
250
|
lastStats = GenerationStats(
|
|
@@ -279,8 +282,8 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
|
|
|
279
282
|
)
|
|
280
283
|
|
|
281
284
|
try {
|
|
282
|
-
val userMsg = LiteRTMessage.of(message)
|
|
283
|
-
conversation!!.sendMessageAsync(userMsg, listener)
|
|
285
|
+
val userMsg = LiteRTMessage.of(text = message)
|
|
286
|
+
conversation!!.sendMessageAsync(message = userMsg, callback = listener)
|
|
284
287
|
} catch (e: Exception) {
|
|
285
288
|
Log.e(TAG, "Failed to initiate async generation", e)
|
|
286
289
|
onToken("Error: ${e.message}", true)
|
|
@@ -343,19 +346,14 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
|
|
|
343
346
|
// Use factory method Message.of passing a list of Content
|
|
344
347
|
val textContent = Content.Text(message)
|
|
345
348
|
|
|
346
|
-
val
|
|
347
|
-
textContent,
|
|
348
|
-
Content.ImageFile(processedImagePath)
|
|
349
|
-
)
|
|
350
|
-
|
|
351
|
-
val userMsg = LiteRTMessage.of(contentList)
|
|
349
|
+
val userMsg = LiteRTMessage.of(textContent, Content.ImageFile(processedImagePath))
|
|
352
350
|
|
|
353
351
|
// Add to history
|
|
354
352
|
history.add(Message(Role.USER, "$message [Image]"))
|
|
355
353
|
|
|
356
|
-
val responseMsg = conversation!!.sendMessage(userMsg)
|
|
354
|
+
val responseMsg = conversation!!.sendMessage(message = userMsg)
|
|
357
355
|
|
|
358
|
-
val response = responseMsg.contents
|
|
356
|
+
val response = responseMsg.contents.contents
|
|
359
357
|
.filterIsInstance<Content.Text>()
|
|
360
358
|
.joinToString("") { it.text }
|
|
361
359
|
|
|
@@ -490,18 +488,16 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
|
|
|
490
488
|
|
|
491
489
|
// Load audio
|
|
492
490
|
|
|
493
|
-
val
|
|
491
|
+
val userMsg = LiteRTMessage.of(
|
|
494
492
|
Content.Text(message),
|
|
495
493
|
Content.AudioFile(audioPath)
|
|
496
494
|
)
|
|
497
|
-
|
|
498
|
-
val userMsg = LiteRTMessage.of(contentList)
|
|
499
495
|
|
|
500
496
|
history.add(Message(Role.USER, "$message [Audio]"))
|
|
501
497
|
|
|
502
|
-
val responseMsg = conversation!!.sendMessage(userMsg)
|
|
498
|
+
val responseMsg = conversation!!.sendMessage(message = userMsg)
|
|
503
499
|
|
|
504
|
-
val response = responseMsg.contents
|
|
500
|
+
val response = responseMsg.contents.contents
|
|
505
501
|
.filterIsInstance<Content.Text>()
|
|
506
502
|
.joinToString("") { it.text }
|
|
507
503
|
|
|
@@ -628,8 +624,8 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
|
|
|
628
624
|
// Send system instruction as the first turn to prime the conversation.
|
|
629
625
|
// LiteRT-LM's Conversation API handles chat template formatting,
|
|
630
626
|
// including Gemma's <start_of_turn>system block.
|
|
631
|
-
val systemMsg = LiteRTMessage.of(
|
|
632
|
-
conversation!!.sendMessage(systemMsg)
|
|
627
|
+
val systemMsg = LiteRTMessage.of(Content.Text(prompt))
|
|
628
|
+
conversation!!.sendMessage(message = systemMsg)
|
|
633
629
|
Log.i(TAG, "System prompt applied (${prompt.length} chars)")
|
|
634
630
|
} catch (e: Exception) {
|
|
635
631
|
Log.w(TAG, "Failed to apply system prompt: ${e.message}")
|
package/app.plugin.js
CHANGED
|
@@ -2,10 +2,12 @@
|
|
|
2
2
|
* Expo config plugin for react-native-litert-lm.
|
|
3
3
|
*
|
|
4
4
|
* Ensures correct build settings for the LiteRT-LM native module:
|
|
5
|
-
* - Android: minSdkVersion 26,
|
|
6
|
-
* - iOS: deployment target 15.0
|
|
5
|
+
* - Android: minSdkVersion 26, Kotlin 2.3.0 (required by litertlm-android AAR)
|
|
7
6
|
*/
|
|
8
|
-
const {
|
|
7
|
+
const {
|
|
8
|
+
withGradleProperties,
|
|
9
|
+
withProjectBuildGradle,
|
|
10
|
+
} = require('@expo/config-plugins');
|
|
9
11
|
|
|
10
12
|
function withLiteRTLM(config) {
|
|
11
13
|
// Android: Ensure minSdkVersion is at least 26
|
|
@@ -27,6 +29,29 @@ function withLiteRTLM(config) {
|
|
|
27
29
|
return config;
|
|
28
30
|
});
|
|
29
31
|
|
|
32
|
+
// Android: Pin Kotlin Gradle plugin to 2.3.0
|
|
33
|
+
// The litertlm-android AAR uses Kotlin 2.3.0 metadata (version defined in
|
|
34
|
+
// package.json → litertLm.androidMavenVersion).
|
|
35
|
+
// React Native's default Kotlin version (2.1.0) cannot read this metadata,
|
|
36
|
+
// so we must force the Kotlin Gradle plugin to 2.3.0 in the project-level
|
|
37
|
+
// build.gradle. This ensures the fix survives `expo prebuild --clean`.
|
|
38
|
+
config = withProjectBuildGradle(config, (config) => {
|
|
39
|
+
if (config.modResults.language === 'groovy') {
|
|
40
|
+
const contents = config.modResults.contents;
|
|
41
|
+
|
|
42
|
+
// Only add if not already pinned
|
|
43
|
+
if (!contents.includes("kotlin-gradle-plugin:2.3.0")) {
|
|
44
|
+
// Replace the unversioned kotlin-gradle-plugin classpath with a pinned one
|
|
45
|
+
config.modResults.contents = contents.replace(
|
|
46
|
+
"classpath('org.jetbrains.kotlin:kotlin-gradle-plugin')",
|
|
47
|
+
"classpath('org.jetbrains.kotlin:kotlin-gradle-plugin:2.3.0')"
|
|
48
|
+
);
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
return config;
|
|
53
|
+
});
|
|
54
|
+
|
|
30
55
|
return config;
|
|
31
56
|
}
|
|
32
57
|
|
package/cpp/HybridLiteRTLM.cpp
CHANGED
|
@@ -18,16 +18,57 @@
|
|
|
18
18
|
#include <chrono>
|
|
19
19
|
#include <stdexcept>
|
|
20
20
|
#include <sstream>
|
|
21
|
+
#include <sys/stat.h>
|
|
22
|
+
#include <cstdio>
|
|
21
23
|
|
|
22
24
|
#ifdef __APPLE__
|
|
23
25
|
#include "IOSDownloadHelper.h"
|
|
26
|
+
#include <os/proc.h>
|
|
24
27
|
#endif
|
|
25
28
|
#include <fstream>
|
|
26
29
|
#include <thread>
|
|
27
30
|
#include <regex>
|
|
31
|
+
#include <pthread.h>
|
|
32
|
+
#include <functional>
|
|
28
33
|
|
|
29
34
|
namespace margelo::nitro::litertlm {
|
|
30
35
|
|
|
36
|
+
// =============================================================================
|
|
37
|
+
// Thread Helper — LiteRT engine operations need >512KB stack (XNNPack, Metal)
|
|
38
|
+
// =============================================================================
|
|
39
|
+
|
|
40
|
+
static void runOnLargeStack(std::function<void()> work, size_t stackSize = 8 * 1024 * 1024) {
|
|
41
|
+
struct Context {
|
|
42
|
+
std::function<void()> fn;
|
|
43
|
+
std::exception_ptr exception;
|
|
44
|
+
};
|
|
45
|
+
Context ctx{std::move(work), nullptr};
|
|
46
|
+
|
|
47
|
+
pthread_t thread;
|
|
48
|
+
pthread_attr_t attr;
|
|
49
|
+
pthread_attr_init(&attr);
|
|
50
|
+
pthread_attr_setstacksize(&attr, stackSize);
|
|
51
|
+
|
|
52
|
+
int rc = pthread_create(&thread, &attr, [](void* arg) -> void* {
|
|
53
|
+
auto* c = static_cast<Context*>(arg);
|
|
54
|
+
try {
|
|
55
|
+
c->fn();
|
|
56
|
+
} catch (...) {
|
|
57
|
+
c->exception = std::current_exception();
|
|
58
|
+
}
|
|
59
|
+
return nullptr;
|
|
60
|
+
}, &ctx);
|
|
61
|
+
pthread_attr_destroy(&attr);
|
|
62
|
+
if (rc != 0) {
|
|
63
|
+
throw std::runtime_error("Failed to create large-stack thread (errno: " + std::to_string(rc) + ")");
|
|
64
|
+
}
|
|
65
|
+
pthread_join(thread, nullptr);
|
|
66
|
+
|
|
67
|
+
if (ctx.exception) {
|
|
68
|
+
std::rethrow_exception(ctx.exception);
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
|
|
31
72
|
// =============================================================================
|
|
32
73
|
// JSON Helpers
|
|
33
74
|
// =============================================================================
|
|
@@ -189,7 +230,9 @@ std::shared_ptr<Promise<void>> HybridLiteRTLM::loadModel(
|
|
|
189
230
|
const std::string& modelPath,
|
|
190
231
|
const std::optional<LLMConfig>& config) {
|
|
191
232
|
return Promise<void>::async([this, modelPath, config]() {
|
|
192
|
-
|
|
233
|
+
runOnLargeStack([&]() {
|
|
234
|
+
loadModelInternal(modelPath, config);
|
|
235
|
+
});
|
|
193
236
|
});
|
|
194
237
|
}
|
|
195
238
|
|
|
@@ -241,7 +284,7 @@ void HybridLiteRTLM::loadModelInternal(
|
|
|
241
284
|
modelPath.c_str(),
|
|
242
285
|
backend,
|
|
243
286
|
visionBackend,
|
|
244
|
-
|
|
287
|
+
nullptr // audio executor not supported on iOS yet
|
|
245
288
|
);
|
|
246
289
|
if (!settings) {
|
|
247
290
|
return false;
|
|
@@ -250,6 +293,10 @@ void HybridLiteRTLM::loadModelInternal(
|
|
|
250
293
|
litert_lm_engine_settings_set_max_num_tokens(settings, static_cast<int>(maxTokens_));
|
|
251
294
|
litert_lm_engine_settings_enable_benchmark(settings);
|
|
252
295
|
|
|
296
|
+
// Set cache directory to the same directory as the model file
|
|
297
|
+
std::string cacheDir = modelPath.substr(0, modelPath.find_last_of('/'));
|
|
298
|
+
litert_lm_engine_settings_set_cache_dir(settings, cacheDir.c_str());
|
|
299
|
+
|
|
253
300
|
engine_ = litert_lm_engine_create(settings);
|
|
254
301
|
litert_lm_engine_settings_delete(settings);
|
|
255
302
|
|
|
@@ -275,9 +322,32 @@ void HybridLiteRTLM::loadModelInternal(
|
|
|
275
322
|
}
|
|
276
323
|
|
|
277
324
|
if (!engine_) {
|
|
325
|
+
// Collect diagnostic info
|
|
326
|
+
std::string diag = " | Diagnostics: ";
|
|
327
|
+
struct stat st;
|
|
328
|
+
if (stat(modelPath.c_str(), &st) == 0) {
|
|
329
|
+
diag += "File size: " + std::to_string(st.st_size) + " bytes";
|
|
330
|
+
} else {
|
|
331
|
+
diag += "Failed to stat file (errno: " + std::to_string(errno) + ")";
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
FILE* f = fopen(modelPath.c_str(), "rb");
|
|
335
|
+
if (f) {
|
|
336
|
+
diag += ", Readable: YES";
|
|
337
|
+
fclose(f);
|
|
338
|
+
} else {
|
|
339
|
+
diag += ", Readable: NO (errno: " + std::to_string(errno) + ")";
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
// Get the native error from the C API
|
|
343
|
+
const char* nativeErr = litert_lm_get_last_error();
|
|
344
|
+
if (nativeErr && nativeErr[0] != '\0') {
|
|
345
|
+
diag += " | Native error: " + std::string(nativeErr);
|
|
346
|
+
}
|
|
347
|
+
|
|
278
348
|
throw std::runtime_error(
|
|
279
349
|
"Failed to create LiteRT-LM engine. Tried backend '" +
|
|
280
|
-
std::string(primaryBackend) + "' and CPU fallback. Model path: " + modelPath);
|
|
350
|
+
std::string(primaryBackend) + "' and CPU fallback. Model path: " + modelPath + diag);
|
|
281
351
|
}
|
|
282
352
|
|
|
283
353
|
session_config_ = litert_lm_session_config_create();
|
|
@@ -307,7 +377,11 @@ void HybridLiteRTLM::loadModelInternal(
|
|
|
307
377
|
|
|
308
378
|
std::shared_ptr<Promise<std::string>> HybridLiteRTLM::sendMessage(const std::string& message) {
|
|
309
379
|
return Promise<std::string>::async([this, message]() -> std::string {
|
|
310
|
-
|
|
380
|
+
std::string result;
|
|
381
|
+
runOnLargeStack([&]() {
|
|
382
|
+
result = sendMessageInternal(message);
|
|
383
|
+
});
|
|
384
|
+
return result;
|
|
311
385
|
});
|
|
312
386
|
}
|
|
313
387
|
|
|
@@ -416,34 +490,42 @@ void HybridLiteRTLM::sendMessageAsync(
|
|
|
416
490
|
auto onTokenCopy = onToken;
|
|
417
491
|
auto messageCopy = message;
|
|
418
492
|
|
|
419
|
-
// Capture shared state safely
|
|
420
|
-
auto
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
493
|
+
// Capture shared state safely — use unique_ptr to prevent leaks
|
|
494
|
+
auto ctxOwner = std::make_unique<StreamContext>();
|
|
495
|
+
ctxOwner->onToken = std::move(onTokenCopy);
|
|
496
|
+
ctxOwner->fullResponse = "";
|
|
497
|
+
ctxOwner->history = &history_;
|
|
498
|
+
ctxOwner->historyMutex = &mutex_;
|
|
499
|
+
ctxOwner->userMessage = messageCopy;
|
|
500
|
+
ctxOwner->lastStats = &lastStats_;
|
|
501
|
+
ctxOwner->startTime = std::chrono::steady_clock::now();
|
|
502
|
+
ctxOwner->tokenCount = 0;
|
|
429
503
|
|
|
430
504
|
#ifdef __APPLE__
|
|
431
505
|
ensureLoaded();
|
|
432
506
|
|
|
433
507
|
std::string msgJson = buildTextMessageJson(messageCopy);
|
|
434
508
|
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
509
|
+
// Release ownership — the C callback now owns the context via raw pointer.
|
|
510
|
+
// streamCallbackFn will delete it when done or on error.
|
|
511
|
+
StreamContext* ctx = ctxOwner.release();
|
|
438
512
|
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
513
|
+
// Wrap the initial engine call in runOnLargeStack for consistency
|
|
514
|
+
// with all other engine entry points (XNNPack needs >512KB stack).
|
|
515
|
+
runOnLargeStack([&]() {
|
|
516
|
+
int result = litert_lm_conversation_send_message_stream(
|
|
517
|
+
conversation_, msgJson.c_str(), nullptr,
|
|
518
|
+
streamCallbackFn, ctx);
|
|
519
|
+
|
|
520
|
+
if (result != 0) {
|
|
521
|
+
delete ctx;
|
|
522
|
+
throw std::runtime_error("LiteRT-LM: Failed to start streaming inference");
|
|
523
|
+
}
|
|
524
|
+
});
|
|
443
525
|
#else
|
|
444
526
|
// Non-Apple stub
|
|
445
|
-
|
|
446
|
-
|
|
527
|
+
ctxOwner->onToken("[iOS only] Streaming not available on this platform.", true);
|
|
528
|
+
// ctxOwner auto-deleted by unique_ptr
|
|
447
529
|
#endif
|
|
448
530
|
}
|
|
449
531
|
|
|
@@ -455,7 +537,11 @@ std::shared_ptr<Promise<std::string>> HybridLiteRTLM::sendMessageWithImage(
|
|
|
455
537
|
const std::string& message,
|
|
456
538
|
const std::string& imagePath) {
|
|
457
539
|
return Promise<std::string>::async([this, message, imagePath]() -> std::string {
|
|
458
|
-
|
|
540
|
+
std::string result;
|
|
541
|
+
runOnLargeStack([&]() {
|
|
542
|
+
result = sendMessageWithImageInternal(message, imagePath);
|
|
543
|
+
});
|
|
544
|
+
return result;
|
|
459
545
|
});
|
|
460
546
|
}
|
|
461
547
|
|
|
@@ -484,7 +570,12 @@ std::string HybridLiteRTLM::sendMessageWithImageInternal(
|
|
|
484
570
|
conversation_, msgJson.c_str(), nullptr);
|
|
485
571
|
|
|
486
572
|
if (!response) {
|
|
487
|
-
|
|
573
|
+
std::string errMsg = "LiteRT-LM: sendMessageWithImage failed";
|
|
574
|
+
const char* nativeErr = litert_lm_get_last_error();
|
|
575
|
+
if (nativeErr && nativeErr[0] != '\0') {
|
|
576
|
+
errMsg += ": " + std::string(nativeErr);
|
|
577
|
+
}
|
|
578
|
+
throw std::runtime_error(errMsg);
|
|
488
579
|
}
|
|
489
580
|
|
|
490
581
|
const char* responseStr = litert_lm_json_response_get_string(response);
|
|
@@ -513,7 +604,11 @@ std::shared_ptr<Promise<std::string>> HybridLiteRTLM::sendMessageWithAudio(
|
|
|
513
604
|
const std::string& message,
|
|
514
605
|
const std::string& audioPath) {
|
|
515
606
|
return Promise<std::string>::async([this, message, audioPath]() -> std::string {
|
|
516
|
-
|
|
607
|
+
std::string result;
|
|
608
|
+
runOnLargeStack([&]() {
|
|
609
|
+
result = sendMessageWithAudioInternal(message, audioPath);
|
|
610
|
+
});
|
|
611
|
+
return result;
|
|
517
612
|
});
|
|
518
613
|
}
|
|
519
614
|
|
|
@@ -573,16 +668,8 @@ std::shared_ptr<Promise<std::string>> HybridLiteRTLM::downloadModel(
|
|
|
573
668
|
#ifdef __APPLE__
|
|
574
669
|
return litert_lm::downloadModelFile(url, fileName, onProgress);
|
|
575
670
|
#else
|
|
576
|
-
|
|
577
|
-
std::
|
|
578
|
-
int result = system(curlCmd.c_str());
|
|
579
|
-
if (result != 0) {
|
|
580
|
-
throw std::runtime_error("Failed to download model from: " + url);
|
|
581
|
-
}
|
|
582
|
-
if (onProgress.has_value()) {
|
|
583
|
-
onProgress.value()(1.0);
|
|
584
|
-
}
|
|
585
|
-
return destPath;
|
|
671
|
+
// Non-Apple platforms: not supported from C++ (Android uses Kotlin)
|
|
672
|
+
throw std::runtime_error("Download not available on this platform. Use the Kotlin implementation.");
|
|
586
673
|
#endif
|
|
587
674
|
});
|
|
588
675
|
}
|
|
@@ -654,8 +741,8 @@ GenerationStats HybridLiteRTLM::getStats() {
|
|
|
654
741
|
// =============================================================================
|
|
655
742
|
|
|
656
743
|
MemoryUsage HybridLiteRTLM::getMemoryUsage() {
|
|
657
|
-
double
|
|
658
|
-
double
|
|
744
|
+
double nativeHeapBytes = 0;
|
|
745
|
+
double residentBytes = 0;
|
|
659
746
|
double availableBytes = 0;
|
|
660
747
|
bool isLowMemory = false;
|
|
661
748
|
|
|
@@ -670,33 +757,26 @@ MemoryUsage HybridLiteRTLM::getMemoryUsage() {
|
|
|
670
757
|
&count);
|
|
671
758
|
|
|
672
759
|
if (kr == KERN_SUCCESS) {
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
mach_port_t host_port = mach_host_self();
|
|
678
|
-
struct host_basic_info hostInfo;
|
|
679
|
-
mach_msg_type_number_t hostCount = HOST_BASIC_INFO_COUNT;
|
|
680
|
-
|
|
681
|
-
kr = host_info(host_port, HOST_BASIC_INFO,
|
|
682
|
-
(host_info_t)&hostInfo, &hostCount);
|
|
683
|
-
|
|
684
|
-
if (kr == KERN_SUCCESS) {
|
|
685
|
-
totalMemoryBytes = static_cast<double>(hostInfo.max_mem);
|
|
760
|
+
residentBytes = static_cast<double>(info.resident_size);
|
|
761
|
+
// On iOS, mach_task_basic_info doesn't separate heap from RSS.
|
|
762
|
+
// Use resident_size (current RSS) as a proxy for native heap usage.
|
|
763
|
+
nativeHeapBytes = static_cast<double>(info.resident_size);
|
|
686
764
|
}
|
|
687
765
|
|
|
688
|
-
|
|
689
|
-
|
|
766
|
+
// Use os_proc_available_memory() (iOS 13+) for accurate Jetsam headroom.
|
|
767
|
+
// This reports how much memory the process can still allocate before
|
|
768
|
+
// the system kills it — far more accurate than total_physical - process_rss.
|
|
769
|
+
availableBytes = static_cast<double>(os_proc_available_memory());
|
|
690
770
|
|
|
691
771
|
// Low memory threshold (~200MB available)
|
|
692
|
-
isLowMemory =
|
|
772
|
+
isLowMemory = availableBytes < 200.0 * 1024.0 * 1024.0;
|
|
693
773
|
#endif
|
|
694
774
|
|
|
695
775
|
return MemoryUsage{
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
availableBytes,
|
|
699
|
-
isLowMemory
|
|
776
|
+
nativeHeapBytes, // nativeHeapBytes (RSS as proxy on iOS)
|
|
777
|
+
residentBytes, // residentBytes
|
|
778
|
+
availableBytes, // availableMemoryBytes
|
|
779
|
+
isLowMemory // isLowMemory
|
|
700
780
|
};
|
|
701
781
|
}
|
|
702
782
|
|
package/cpp/HybridLiteRTLM.hpp
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
// react-native-litert-lm
|
|
4
4
|
//
|
|
5
5
|
// High-performance LLM inference using LiteRT-LM.
|
|
6
|
-
// Supports Gemma 3n and other .litertlm models.
|
|
6
|
+
// Supports Gemma 4, Gemma 3n, and other .litertlm models.
|
|
7
7
|
//
|
|
8
8
|
// NOTE: This C++ implementation is used for iOS ONLY.
|
|
9
9
|
// Android uses the Kotlin implementation in `android/src/main/java/com/margelo/nitro/dev/litert/litertlm/HybridLiteRTLM.kt`.
|
|
@@ -112,7 +112,7 @@ private:
|
|
|
112
112
|
mutable std::mutex mutex_;
|
|
113
113
|
|
|
114
114
|
// Configuration - backend
|
|
115
|
-
Backend backend_ = Backend::
|
|
115
|
+
Backend backend_ = Backend::CPU;
|
|
116
116
|
|
|
117
117
|
// System prompt / instruction
|
|
118
118
|
std::string systemPrompt_;
|
|
@@ -232,11 +232,18 @@ LITERT_LM_C_API_EXPORT
|
|
|
232
232
|
void litert_lm_engine_settings_set_num_decode_tokens(
|
|
233
233
|
LiteRtLmEngineSettings* settings, int num_decode_tokens);
|
|
234
234
|
|
|
235
|
+
// Returns the last error message from a failed C API call.
|
|
236
|
+
// Returns an empty string if no error has occurred.
|
|
237
|
+
// The returned pointer is valid until the next C API call on the same thread.
|
|
238
|
+
LITERT_LM_C_API_EXPORT
|
|
239
|
+
const char* litert_lm_get_last_error();
|
|
240
|
+
|
|
235
241
|
// Creates a LiteRT LM Engine from the given settings. The caller is responsible
|
|
236
242
|
// for destroying the engine using `litert_lm_engine_delete`.
|
|
237
243
|
//
|
|
238
244
|
// @param settings The engine settings.
|
|
239
245
|
// @return A pointer to the created engine, or NULL on failure.
|
|
246
|
+
// Call litert_lm_get_last_error() for details on failure.
|
|
240
247
|
LITERT_LM_C_API_EXPORT
|
|
241
248
|
LiteRtLmEngine* litert_lm_engine_create(const LiteRtLmEngineSettings* settings);
|
|
242
249
|
|
package/lib/index.d.ts
CHANGED
|
@@ -45,6 +45,10 @@ export { createLLM } from "./modelFactory";
|
|
|
45
45
|
* Use with model download utilities or as reference.
|
|
46
46
|
*/
|
|
47
47
|
export declare const Models: {
|
|
48
|
+
/** Gemma 4 E2B Instruct (2B parameters, latest generation) */
|
|
49
|
+
readonly GEMMA_4_E2B: "gemma-4-E2B-it-litert-lm";
|
|
50
|
+
/** Gemma 4 E4B Instruct (4B parameters, higher quality) */
|
|
51
|
+
readonly GEMMA_4_E4B: "gemma-4-E4B-it-litert-lm";
|
|
48
52
|
/** Gemma 3n E2B (2B parameters, efficient) */
|
|
49
53
|
readonly GEMMA_3N_E2B: "gemma-3n-E2B-it-litert-lm-preview";
|
|
50
54
|
/** Gemma 3n E4B (4B parameters, higher quality) */
|
|
@@ -59,9 +63,10 @@ export declare const Models: {
|
|
|
59
63
|
export type ModelId = (typeof Models)[keyof typeof Models];
|
|
60
64
|
/**
|
|
61
65
|
* Get the recommended backend for the current platform.
|
|
62
|
-
* Returns '
|
|
66
|
+
* Returns 'cpu' as the safe default. GPU (Metal on iOS, GPU delegate on Android)
|
|
67
|
+
* is faster but may not be available on all devices or model configurations.
|
|
63
68
|
*
|
|
64
|
-
* @returns The recommended backend ('
|
|
69
|
+
* @returns The recommended backend ('cpu')
|
|
65
70
|
*
|
|
66
71
|
* @example
|
|
67
72
|
* ```typescript
|
|
@@ -106,5 +111,17 @@ export declare function checkBackendSupport(backend: Backend): string | undefine
|
|
|
106
111
|
export declare function checkMultimodalSupport(): string | undefined;
|
|
107
112
|
/**
|
|
108
113
|
* Download URL for the Gemma 3n E2B IT INT4 model.
|
|
114
|
+
* Note: Requires a HuggingFace account (gated model).
|
|
109
115
|
*/
|
|
110
116
|
export declare const GEMMA_3N_E2B_IT_INT4 = "https://litert.dev/gemma-3n-E2B-it-int4.litertlm";
|
|
117
|
+
/**
|
|
118
|
+
* Download URL for the Gemma 4 E2B IT model (2.58 GB).
|
|
119
|
+
* Public — no HuggingFace account required.
|
|
120
|
+
*/
|
|
121
|
+
export declare const GEMMA_4_E2B_IT = "https://huggingface.co/litert-community/gemma-4-E2B-it-litert-lm/resolve/main/gemma-4-E2B-it.litertlm";
|
|
122
|
+
/**
|
|
123
|
+
* Download URL for the Gemma 4 E4B IT model (3.65 GB).
|
|
124
|
+
* Higher quality than E2B but requires more device memory.
|
|
125
|
+
* Public — no HuggingFace account required.
|
|
126
|
+
*/
|
|
127
|
+
export declare const GEMMA_4_E4B_IT = "https://huggingface.co/litert-community/gemma-4-E4B-it-litert-lm/resolve/main/gemma-4-E4B-it.litertlm";
|
package/lib/index.js
CHANGED
|
@@ -14,7 +14,7 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
|
14
14
|
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
15
15
|
};
|
|
16
16
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
-
exports.GEMMA_3N_E2B_IT_INT4 = exports.Models = exports.createLLM = exports.createNativeBuffer = exports.createMemoryTracker = exports.applyLlamaTemplate = exports.applyPhiTemplate = exports.applyGemmaTemplate = void 0;
|
|
17
|
+
exports.GEMMA_4_E4B_IT = exports.GEMMA_4_E2B_IT = exports.GEMMA_3N_E2B_IT_INT4 = exports.Models = exports.createLLM = exports.createNativeBuffer = exports.createMemoryTracker = exports.applyLlamaTemplate = exports.applyPhiTemplate = exports.applyGemmaTemplate = void 0;
|
|
18
18
|
exports.getRecommendedBackend = getRecommendedBackend;
|
|
19
19
|
exports.checkBackendSupport = checkBackendSupport;
|
|
20
20
|
exports.checkMultimodalSupport = checkMultimodalSupport;
|
|
@@ -67,6 +67,10 @@ Object.defineProperty(exports, "createLLM", { enumerable: true, get: function ()
|
|
|
67
67
|
* Use with model download utilities or as reference.
|
|
68
68
|
*/
|
|
69
69
|
exports.Models = {
|
|
70
|
+
/** Gemma 4 E2B Instruct (2B parameters, latest generation) */
|
|
71
|
+
GEMMA_4_E2B: "gemma-4-E2B-it-litert-lm",
|
|
72
|
+
/** Gemma 4 E4B Instruct (4B parameters, higher quality) */
|
|
73
|
+
GEMMA_4_E4B: "gemma-4-E4B-it-litert-lm",
|
|
70
74
|
/** Gemma 3n E2B (2B parameters, efficient) */
|
|
71
75
|
GEMMA_3N_E2B: "gemma-3n-E2B-it-litert-lm-preview",
|
|
72
76
|
/** Gemma 3n E4B (4B parameters, higher quality) */
|
|
@@ -80,9 +84,10 @@ exports.Models = {
|
|
|
80
84
|
};
|
|
81
85
|
/**
|
|
82
86
|
* Get the recommended backend for the current platform.
|
|
83
|
-
* Returns '
|
|
87
|
+
* Returns 'cpu' as the safe default. GPU (Metal on iOS, GPU delegate on Android)
|
|
88
|
+
* is faster but may not be available on all devices or model configurations.
|
|
84
89
|
*
|
|
85
|
-
* @returns The recommended backend ('
|
|
90
|
+
* @returns The recommended backend ('cpu')
|
|
86
91
|
*
|
|
87
92
|
* @example
|
|
88
93
|
* ```typescript
|
|
@@ -91,9 +96,9 @@ exports.Models = {
|
|
|
91
96
|
* ```
|
|
92
97
|
*/
|
|
93
98
|
function getRecommendedBackend() {
|
|
94
|
-
//
|
|
95
|
-
//
|
|
96
|
-
return "
|
|
99
|
+
// CPU is the safe default — always available, broadly compatible.
|
|
100
|
+
// GPU is faster but may fail on some models/devices.
|
|
101
|
+
return "cpu";
|
|
97
102
|
}
|
|
98
103
|
/**
|
|
99
104
|
* Check if a backend configuration is supported on the current platform.
|
|
@@ -146,5 +151,17 @@ function checkMultimodalSupport() {
|
|
|
146
151
|
}
|
|
147
152
|
/**
|
|
148
153
|
* Download URL for the Gemma 3n E2B IT INT4 model.
|
|
154
|
+
* Note: Requires a HuggingFace account (gated model).
|
|
149
155
|
*/
|
|
150
156
|
exports.GEMMA_3N_E2B_IT_INT4 = "https://litert.dev/gemma-3n-E2B-it-int4.litertlm";
|
|
157
|
+
/**
|
|
158
|
+
* Download URL for the Gemma 4 E2B IT model (2.58 GB).
|
|
159
|
+
* Public — no HuggingFace account required.
|
|
160
|
+
*/
|
|
161
|
+
exports.GEMMA_4_E2B_IT = "https://huggingface.co/litert-community/gemma-4-E2B-it-litert-lm/resolve/main/gemma-4-E2B-it.litertlm";
|
|
162
|
+
/**
|
|
163
|
+
* Download URL for the Gemma 4 E4B IT model (3.65 GB).
|
|
164
|
+
* Higher quality than E2B but requires more device memory.
|
|
165
|
+
* Public — no HuggingFace account required.
|
|
166
|
+
*/
|
|
167
|
+
exports.GEMMA_4_E4B_IT = "https://huggingface.co/litert-community/gemma-4-E4B-it-litert-lm/resolve/main/gemma-4-E4B-it.litertlm";
|
|
@@ -26,18 +26,18 @@ export interface LLMConfig {
|
|
|
26
26
|
systemPrompt?: string;
|
|
27
27
|
/**
|
|
28
28
|
* Primary compute backend for text generation.
|
|
29
|
-
* - 'cpu': CPU inference (
|
|
30
|
-
* - 'gpu': GPU acceleration (fast,
|
|
29
|
+
* - 'cpu': CPU inference (safe default, always available)
|
|
30
|
+
* - 'gpu': GPU acceleration (fast, Metal on iOS, GPU delegate on Android)
|
|
31
31
|
* - 'npu': NPU/Neural Engine (fastest on supported devices)
|
|
32
32
|
*
|
|
33
|
-
* If not specified, defaults to '
|
|
33
|
+
* If not specified, defaults to 'cpu'.
|
|
34
34
|
* If specified backend is unavailable, falls back automatically.
|
|
35
35
|
*
|
|
36
36
|
* @remarks
|
|
37
|
-
* Vision encoder is always set to GPU (required by Gemma
|
|
37
|
+
* Vision encoder is always set to GPU (required by Gemma models).
|
|
38
38
|
* Audio encoder is always set to CPU (optimal for audio processing).
|
|
39
39
|
*
|
|
40
|
-
* @default '
|
|
40
|
+
* @default 'cpu'
|
|
41
41
|
*/
|
|
42
42
|
backend?: Backend;
|
|
43
43
|
/**
|
|
@@ -104,12 +104,12 @@ export interface MemoryUsage {
|
|
|
104
104
|
}
|
|
105
105
|
/**
|
|
106
106
|
* LiteRT-LM: High-performance LLM inference engine.
|
|
107
|
-
* Supports Gemma 3n, Phi-4, Qwen, and other .litertlm models.
|
|
107
|
+
* Supports Gemma 4, Gemma 3n, Phi-4, Qwen, and other .litertlm models.
|
|
108
108
|
*
|
|
109
109
|
* @example
|
|
110
110
|
* ```typescript
|
|
111
111
|
* const llm = createLLM();
|
|
112
|
-
* llm.loadModel('/path/to/gemma-
|
|
112
|
+
* llm.loadModel('/path/to/gemma-4-E2B-it.litertlm', { backend: 'cpu' });
|
|
113
113
|
*
|
|
114
114
|
* // Blocking generation
|
|
115
115
|
* const response = llm.sendMessage('What is the capital of France?');
|
package/package.json
CHANGED
|
@@ -1,7 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "react-native-litert-lm",
|
|
3
|
-
"version": "0.3.
|
|
4
|
-
"
|
|
3
|
+
"version": "0.3.2",
|
|
4
|
+
"litertLm": {
|
|
5
|
+
"version": "0.10.1",
|
|
6
|
+
"androidMavenVersion": "0.10.0",
|
|
7
|
+
"iosGitTag": "v0.10.1"
|
|
8
|
+
},
|
|
9
|
+
"description": "High-performance LLM inference for React Native using LiteRT-LM. Optimized for Gemma 4 and other on-device language models.",
|
|
5
10
|
"license": "MIT",
|
|
6
11
|
"author": "Hugh Chen (https://github.com/hung-yueh)",
|
|
7
12
|
"repository": {
|
|
@@ -19,6 +24,7 @@
|
|
|
19
24
|
"litert-lm",
|
|
20
25
|
"llm",
|
|
21
26
|
"gemma",
|
|
27
|
+
"gemma-4",
|
|
22
28
|
"gemma-3n",
|
|
23
29
|
"ai",
|
|
24
30
|
"machine-learning",
|
|
@@ -69,26 +75,26 @@
|
|
|
69
75
|
"release": "release-it"
|
|
70
76
|
},
|
|
71
77
|
"devDependencies": {
|
|
72
|
-
"@expo/config-plugins": "~
|
|
73
|
-
"@types/react": "~19.
|
|
74
|
-
"expo": "^54.0.31",
|
|
75
|
-
"nitrogen": "^0.35.0",
|
|
76
|
-
"react": "19.1.0",
|
|
77
|
-
"react-native": "0.81.5",
|
|
78
|
+
"@expo/config-plugins": "~55.0.0",
|
|
79
|
+
"@types/react": "~19.2.10",
|
|
78
80
|
"release-it": "^19.2.4",
|
|
79
81
|
"typescript": "^5.0.0"
|
|
80
82
|
},
|
|
81
83
|
"peerDependencies": {
|
|
82
|
-
"expo": ">=
|
|
84
|
+
"expo": ">=55.0.0",
|
|
83
85
|
"react": "*",
|
|
84
|
-
"react-native": "*"
|
|
86
|
+
"react-native": "*",
|
|
87
|
+
"react-native-nitro-modules": "^0.35.0"
|
|
85
88
|
},
|
|
86
89
|
"peerDependenciesMeta": {
|
|
87
90
|
"expo": {
|
|
88
91
|
"optional": true
|
|
92
|
+
},
|
|
93
|
+
"react": {
|
|
94
|
+
"optional": true
|
|
95
|
+
},
|
|
96
|
+
"react-native": {
|
|
97
|
+
"optional": true
|
|
89
98
|
}
|
|
90
|
-
},
|
|
91
|
-
"dependencies": {
|
|
92
|
-
"react-native-nitro-modules": "^0.35.0"
|
|
93
99
|
}
|
|
94
100
|
}
|
|
@@ -16,12 +16,12 @@
|
|
|
16
16
|
|
|
17
17
|
set -euo pipefail
|
|
18
18
|
|
|
19
|
-
LITERT_LM_VERSION="v0.9.0"
|
|
20
19
|
LITERT_LM_REPO="https://github.com/google-ai-edge/LiteRT-LM.git"
|
|
21
20
|
FRAMEWORK_NAME="LiteRTLM"
|
|
22
21
|
|
|
23
22
|
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
|
24
23
|
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
24
|
+
LITERT_LM_VERSION="$(node -e "console.log(require('$PROJECT_ROOT/package.json').litertLm.iosGitTag)")"
|
|
25
25
|
OUTPUT_DIR="$PROJECT_ROOT/ios/Frameworks"
|
|
26
26
|
C_API_HEADER_DIR="$PROJECT_ROOT/cpp/include"
|
|
27
27
|
BUILD_DIR="$PROJECT_ROOT/.litert-lm-build"
|
|
@@ -57,6 +57,25 @@ fi
|
|
|
57
57
|
|
|
58
58
|
LITERT_SRC="$BUILD_DIR/LiteRT-LM"
|
|
59
59
|
|
|
60
|
+
# ---- 1b. Apply iOS-specific patches ---------------------------------------
|
|
61
|
+
# These patches fix:
|
|
62
|
+
# - mmap PROT_WRITE removal (iOS rejects CoW for large files)
|
|
63
|
+
# - Error capture API (litert_lm_get_last_error)
|
|
64
|
+
# - Engine registerer moved outside anonymous namespace (iOS linker stripping)
|
|
65
|
+
# - Minijinja/Rust stub replacement (custom C++ prompt template)
|
|
66
|
+
PATCHES_DIR="$PROJECT_ROOT/scripts/patches"
|
|
67
|
+
if [ -d "$PATCHES_DIR" ]; then
|
|
68
|
+
for PATCH_FILE in "$PATCHES_DIR"/*.patch; do
|
|
69
|
+
if [ -f "$PATCH_FILE" ]; then
|
|
70
|
+
echo " Applying patch: $(basename "$PATCH_FILE")..."
|
|
71
|
+
cd "$LITERT_SRC"
|
|
72
|
+
git apply --check "$PATCH_FILE" 2>/dev/null && \
|
|
73
|
+
git apply "$PATCH_FILE" || \
|
|
74
|
+
echo " (patch already applied or conflicts, skipping)"
|
|
75
|
+
fi
|
|
76
|
+
done
|
|
77
|
+
fi
|
|
78
|
+
|
|
60
79
|
# ---- 2. Verify Bazel is available -----------------------------------------
|
|
61
80
|
echo ""
|
|
62
81
|
echo "==> Step 2: Checking Bazel..."
|
|
@@ -19,7 +19,7 @@ PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
|
|
19
19
|
OUTPUT_DIR="$PROJECT_ROOT/ios/Frameworks"
|
|
20
20
|
C_API_HEADER_DIR="$PROJECT_ROOT/cpp/include"
|
|
21
21
|
|
|
22
|
-
LITERT_LM_VERSION="
|
|
22
|
+
LITERT_LM_VERSION="$(node -e "console.log(require('$PROJECT_ROOT/package.json').litertLm.iosGitTag)")"
|
|
23
23
|
GITHUB_RAW="https://github.com/google-ai-edge/LiteRT-LM/raw/${LITERT_LM_VERSION}"
|
|
24
24
|
|
|
25
25
|
# Read version from package.json
|
package/src/index.ts
CHANGED
|
@@ -79,6 +79,10 @@ export { createLLM } from "./modelFactory";
|
|
|
79
79
|
* Use with model download utilities or as reference.
|
|
80
80
|
*/
|
|
81
81
|
export const Models = {
|
|
82
|
+
/** Gemma 4 E2B Instruct (2B parameters, latest generation) */
|
|
83
|
+
GEMMA_4_E2B: "gemma-4-E2B-it-litert-lm",
|
|
84
|
+
/** Gemma 4 E4B Instruct (4B parameters, higher quality) */
|
|
85
|
+
GEMMA_4_E4B: "gemma-4-E4B-it-litert-lm",
|
|
82
86
|
/** Gemma 3n E2B (2B parameters, efficient) */
|
|
83
87
|
GEMMA_3N_E2B: "gemma-3n-E2B-it-litert-lm-preview",
|
|
84
88
|
/** Gemma 3n E4B (4B parameters, higher quality) */
|
|
@@ -95,9 +99,10 @@ export type ModelId = (typeof Models)[keyof typeof Models];
|
|
|
95
99
|
|
|
96
100
|
/**
|
|
97
101
|
* Get the recommended backend for the current platform.
|
|
98
|
-
* Returns '
|
|
102
|
+
* Returns 'cpu' as the safe default. GPU (Metal on iOS, GPU delegate on Android)
|
|
103
|
+
* is faster but may not be available on all devices or model configurations.
|
|
99
104
|
*
|
|
100
|
-
* @returns The recommended backend ('
|
|
105
|
+
* @returns The recommended backend ('cpu')
|
|
101
106
|
*
|
|
102
107
|
* @example
|
|
103
108
|
* ```typescript
|
|
@@ -106,9 +111,9 @@ export type ModelId = (typeof Models)[keyof typeof Models];
|
|
|
106
111
|
* ```
|
|
107
112
|
*/
|
|
108
113
|
export function getRecommendedBackend(): Backend {
|
|
109
|
-
//
|
|
110
|
-
//
|
|
111
|
-
return "
|
|
114
|
+
// CPU is the safe default — always available, broadly compatible.
|
|
115
|
+
// GPU is faster but may fail on some models/devices.
|
|
116
|
+
return "cpu";
|
|
112
117
|
}
|
|
113
118
|
|
|
114
119
|
/**
|
|
@@ -165,6 +170,22 @@ export function checkMultimodalSupport(): string | undefined {
|
|
|
165
170
|
|
|
166
171
|
/**
|
|
167
172
|
* Download URL for the Gemma 3n E2B IT INT4 model.
|
|
173
|
+
* Note: Requires a HuggingFace account (gated model).
|
|
168
174
|
*/
|
|
169
175
|
export const GEMMA_3N_E2B_IT_INT4 =
|
|
170
176
|
"https://litert.dev/gemma-3n-E2B-it-int4.litertlm";
|
|
177
|
+
|
|
178
|
+
/**
|
|
179
|
+
* Download URL for the Gemma 4 E2B IT model (2.58 GB).
|
|
180
|
+
* Public — no HuggingFace account required.
|
|
181
|
+
*/
|
|
182
|
+
export const GEMMA_4_E2B_IT =
|
|
183
|
+
"https://huggingface.co/litert-community/gemma-4-E2B-it-litert-lm/resolve/main/gemma-4-E2B-it.litertlm";
|
|
184
|
+
|
|
185
|
+
/**
|
|
186
|
+
* Download URL for the Gemma 4 E4B IT model (3.65 GB).
|
|
187
|
+
* Higher quality than E2B but requires more device memory.
|
|
188
|
+
* Public — no HuggingFace account required.
|
|
189
|
+
*/
|
|
190
|
+
export const GEMMA_4_E4B_IT =
|
|
191
|
+
"https://huggingface.co/litert-community/gemma-4-E4B-it-litert-lm/resolve/main/gemma-4-E4B-it.litertlm";
|
|
@@ -30,18 +30,18 @@ export interface LLMConfig {
|
|
|
30
30
|
|
|
31
31
|
/**
|
|
32
32
|
* Primary compute backend for text generation.
|
|
33
|
-
* - 'cpu': CPU inference (
|
|
34
|
-
* - 'gpu': GPU acceleration (fast,
|
|
33
|
+
* - 'cpu': CPU inference (safe default, always available)
|
|
34
|
+
* - 'gpu': GPU acceleration (fast, Metal on iOS, GPU delegate on Android)
|
|
35
35
|
* - 'npu': NPU/Neural Engine (fastest on supported devices)
|
|
36
36
|
*
|
|
37
|
-
* If not specified, defaults to '
|
|
37
|
+
* If not specified, defaults to 'cpu'.
|
|
38
38
|
* If specified backend is unavailable, falls back automatically.
|
|
39
39
|
*
|
|
40
40
|
* @remarks
|
|
41
|
-
* Vision encoder is always set to GPU (required by Gemma
|
|
41
|
+
* Vision encoder is always set to GPU (required by Gemma models).
|
|
42
42
|
* Audio encoder is always set to CPU (optimal for audio processing).
|
|
43
43
|
*
|
|
44
|
-
* @default '
|
|
44
|
+
* @default 'cpu'
|
|
45
45
|
*/
|
|
46
46
|
backend?: Backend;
|
|
47
47
|
|
|
@@ -116,12 +116,12 @@ export interface MemoryUsage {
|
|
|
116
116
|
|
|
117
117
|
/**
|
|
118
118
|
* LiteRT-LM: High-performance LLM inference engine.
|
|
119
|
-
* Supports Gemma 3n, Phi-4, Qwen, and other .litertlm models.
|
|
119
|
+
* Supports Gemma 4, Gemma 3n, Phi-4, Qwen, and other .litertlm models.
|
|
120
120
|
*
|
|
121
121
|
* @example
|
|
122
122
|
* ```typescript
|
|
123
123
|
* const llm = createLLM();
|
|
124
|
-
* llm.loadModel('/path/to/gemma-
|
|
124
|
+
* llm.loadModel('/path/to/gemma-4-E2B-it.litertlm', { backend: 'cpu' });
|
|
125
125
|
*
|
|
126
126
|
* // Blocking generation
|
|
127
127
|
* const response = llm.sendMessage('What is the capital of France?');
|