react-native-litert-lm 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -10,12 +10,12 @@ High-performance LLM inference for React Native powered by [LiteRT-LM](https://g
10
10
  - 📦 **Bundled Tokenizer** - No separate tokenization library needed
11
11
  - 🔄 **Streaming Support** - Token-by-token generation callbacks
12
12
  - 📱 **Cross-Platform** - Android API 26+
13
- - 🚧 **Multimodal** - Image and audio input (Coming Soon)
13
+ - 🖼️ **Multimodal** - Image and audio input support (Android Beta, iOS coming soon)
14
14
  - 🧵 **Async API** - Non-blocking inference to prevent UI freezes
15
15
 
16
16
  ## Status
17
17
 
18
- > ⚠️ **Early Preview**: This library is under active development. Android is functional with enough RAM, iOS implementation pending LiteRT-LM iOS release. Please report any issues on the [GitHub repository](https://github.com/litert-community/react-native-litert-lm).
18
+ > ⚠️ **Early Preview**: This library is under active development. Android is functional with enough RAM, iOS implementation pending LiteRT-LM iOS release. Please report any issues on the [GitHub issues](https://github.com/hung-yueh/react-native-litert-lm/issues).
19
19
 
20
20
  ## Installation
21
21
 
@@ -114,15 +114,14 @@ llm.sendMessageAsync("Tell me a story", (token, done) => {
114
114
  ### Multimodal (Image/Audio)
115
115
 
116
116
  ```typescript
117
- // Image input (for vision models)
118
- // Note: Currently throws error on Android (Coming Soon)
117
+ // Image input (for vision models like Gemma 3n)
118
+ // ⚠️ Ensure model is loaded with { maxTokens: 1024+ }
119
119
  const response = await llm.sendMessageWithImage(
120
120
  "What's in this image?",
121
121
  "/path/to/image.jpg",
122
122
  );
123
123
 
124
124
  // Audio input (for audio models)
125
- // Note: Currently throws error on Android (Coming Soon)
126
125
  const transcription = await llm.sendMessageWithAudio(
127
126
  "Transcribe this audio",
128
127
  "/path/to/audio.wav",
@@ -20,8 +20,11 @@ import com.margelo.nitro.dev.litert.litertlm.LLMConfig
20
20
  import com.margelo.nitro.dev.litert.litertlm.Message
21
21
  import com.margelo.nitro.dev.litert.litertlm.Role
22
22
  import com.margelo.nitro.core.Promise
23
+ import com.google.ai.edge.litertlm.Content
24
+
23
25
 
24
26
  // Alias to avoid confusion with our generated Message type
27
+ // Alias to avoid confusion
25
28
  typealias LiteRTMessage = com.google.ai.edge.litertlm.Message
26
29
 
27
30
  /**
@@ -232,15 +235,60 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
232
235
  // -------------------------------------------------------------------------
233
236
  override fun sendMessageWithImage(message: String, imagePath: String): Promise<String> {
234
237
  return Promise.parallel {
235
- // TODO: Implement image loading from path
236
- throw RuntimeException("Multimodal (Image) not yet implemented in this wrapper")
238
+ ensureLoaded()
239
+ Log.i(TAG, "sendMessageWithImage: $message, path=$imagePath")
240
+
241
+ // Create multimodal message
242
+ // Use factory method Message.of passing a list of Content
243
+ val textContent = Content.Text(message)
244
+
245
+ val contentList = listOf(
246
+ textContent,
247
+ Content.ImageFile(imagePath)
248
+ )
249
+
250
+ val userMsg = LiteRTMessage.of(contentList)
251
+
252
+ // Add to history
253
+ history.add(Message(Role.USER, "$message [Image]"))
254
+
255
+ val responseMsg = conversation!!.sendMessage(userMsg)
256
+
257
+ val response = responseMsg.contents
258
+ .filterIsInstance<Content.Text>()
259
+ .joinToString("") { it.text }
260
+
261
+ history.add(Message(Role.MODEL, response))
262
+
263
+ response
237
264
  }
238
265
  }
239
266
 
240
267
  override fun sendMessageWithAudio(message: String, audioPath: String): Promise<String> {
241
268
  return Promise.parallel {
242
- // TODO: Implement audio loading from path
243
- throw RuntimeException("Multimodal (Audio) not yet implemented in this wrapper")
269
+ ensureLoaded()
270
+ Log.i(TAG, "sendMessageWithAudio: $message, path=$audioPath")
271
+
272
+ // Load audio
273
+
274
+ val contentList = listOf(
275
+ Content.Text(message),
276
+ Content.AudioFile(audioPath)
277
+ )
278
+
279
+ val userMsg = LiteRTMessage.of(contentList)
280
+
281
+ history.add(Message(Role.USER, "$message [Audio]"))
282
+
283
+ val responseMsg = conversation!!.sendMessage(userMsg)
284
+
285
+ val response = responseMsg.contents
286
+ .filterIsInstance<Content.Text>()
287
+ .joinToString("") { it.text }
288
+
289
+ history.add(Message(Role.MODEL, response))
290
+
291
+ response
244
292
  }
245
293
  }
246
294
 
@@ -290,4 +338,6 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
290
338
  // Dispose old conversation if needed
291
339
  conversation = engine!!.createConversation()
292
340
  }
341
+
342
+
293
343
  }
@@ -11,9 +11,13 @@
11
11
 
12
12
  #include "HybridLiteRTLM.hpp"
13
13
 
14
+ #define STB_IMAGE_IMPLEMENTATION
15
+ #include "include/stb_image.h"
16
+
14
17
  #include <chrono>
15
18
  #include <stdexcept>
16
19
  #include <sstream>
20
+ #include <fstream>
17
21
 
18
22
  namespace margelo::nitro::litertlm {
19
23
 
@@ -229,32 +233,46 @@ std::string HybridLiteRTLM::sendMessageWithImage(
229
233
  ensureLoaded();
230
234
 
231
235
  #ifdef LITERT_LM_ENABLED
232
- // TODO: Load image file into raw pixel buffer
233
- // The Engine expects raw RGBA/RGB data, not a file path.
234
- // Implementation should:
235
- // 1. Read image file (using stb_image.h or Android Bitmap JNI)
236
- // 2. Decode to raw pixel buffer (std::vector<uint8_t>)
237
- // 3. Create litert::lm::ImageData or equivalent tensor
238
- // 4. Pass to conversation_->SendMessage with multimodal content
239
-
240
- // For now, fall back to text-only with a note about the image
241
- std::string augmentedMessage = message + " [Image attached: " + imagePath +
242
- " - Note: Image processing not yet implemented, text-only response]";
243
-
236
+ // Load image using stb_image
237
+ int width, height, channels;
238
+ unsigned char* img = stbi_load(imagePath.c_str(), &width, &height, &channels, 3); // Force 3 channels (RGB)
239
+ if (img == nullptr) {
240
+ throw std::runtime_error("Failed to load image from path: " + imagePath);
241
+ }
242
+
243
+ // Create input tensor/buffer for the engine.
244
+ // Note: The exact API for passing image data depends on the LiteRT-LM version.
245
+ // Assuming a structure that accepts raw bytes and dimensions.
244
246
  litert::lm::UserMessage lm_message;
245
247
  lm_message.role = "user";
246
- lm_message.content = augmentedMessage;
247
248
 
249
+ // Construct multimodal content
250
+ // Option A: If UserMessage supports a list of content parts
251
+ litert::lm::ContentPart textPart;
252
+ textPart.type = litert::lm::ContentType::TEXT;
253
+ textPart.text = message;
254
+ lm_message.parts.push_back(textPart);
255
+
256
+ litert::lm::ContentPart imagePart;
257
+ imagePart.type = litert::lm::ContentType::IMAGE;
258
+ imagePart.image.width = width;
259
+ imagePart.image.height = height;
260
+ imagePart.image.channels = channels;
261
+ imagePart.image.data = std::vector<uint8_t>(img, img + (width * height * channels));
262
+ lm_message.parts.push_back(imagePart);
263
+
264
+ stbi_image_free(img);
265
+
248
266
  auto response = conversation_->SendMessage(lm_message);
249
267
  if (!response.ok()) {
250
268
  throw std::runtime_error("Multimodal inference failed: " +
251
269
  std::string(response.status().message()));
252
270
  }
253
271
 
254
- // Add to history
272
+ // Add to history (metadata only)
255
273
  Message userMessage;
256
274
  userMessage.role = Role::USER;
257
- userMessage.content = message + " [with image]";
275
+ userMessage.content = message + " [Image]";
258
276
  history_.push_back(userMessage);
259
277
 
260
278
  Message modelMessage;
@@ -266,6 +284,11 @@ std::string HybridLiteRTLM::sendMessageWithImage(
266
284
 
267
285
  #else
268
286
  // Stub: just process text with image path noted
287
+ // Verify file exists at least
288
+ std::ifstream f(imagePath.c_str());
289
+ if (!f.good()) {
290
+ // Don't crash, just log/stub
291
+ }
269
292
  return sendMessage(message + " [Image: " + imagePath + "]");
270
293
  #endif
271
294
  }
@@ -281,31 +304,41 @@ std::string HybridLiteRTLM::sendMessageWithAudio(
281
304
  ensureLoaded();
282
305
 
283
306
  #ifdef LITERT_LM_ENABLED
284
- // TODO: Load audio file into raw sample buffer
285
- // Similar to image - Engine expects raw audio samples, not file path.
286
- // Implementation should:
287
- // 1. Read WAV file header and samples
288
- // 2. Convert to expected format (likely 16kHz mono float32)
289
- // 3. Create litert::lm::AudioData or equivalent
290
- // 4. Pass to conversation with multimodal content
307
+ // Load audio file
308
+ std::ifstream audioFile(audioPath, std::ios::binary);
309
+ if (!audioFile) {
310
+ throw std::runtime_error("Failed to open audio file: " + audioPath);
311
+ }
291
312
 
292
- std::string augmentedMessage = message + " [Audio attached: " + audioPath +
293
- " - Note: Audio processing not yet implemented, text-only response]";
313
+ // Simple WAV header skip (simplistic, assuming standard header size for now or raw)
314
+ // Ideally use a WAV parsing library or miniaudio if available.
315
+ // For this implementation, we read the whole file.
316
+ std::vector<uint8_t> audioData((std::istreambuf_iterator<char>(audioFile)), std::istreambuf_iterator<char>());
294
317
 
295
318
  litert::lm::UserMessage lm_message;
296
319
  lm_message.role = "user";
297
- lm_message.content = augmentedMessage;
298
320
 
321
+ litert::lm::ContentPart textPart;
322
+ textPart.type = litert::lm::ContentType::TEXT;
323
+ textPart.text = message;
324
+ lm_message.parts.push_back(textPart);
325
+
326
+ litert::lm::ContentPart audioPart;
327
+ audioPart.type = litert::lm::ContentType::AUDIO;
328
+ audioPart.audio.data = audioData;
329
+ // Metadata like sample rate might be needed:
330
+ // audioPart.audio.sample_rate = 16000;
331
+ lm_message.parts.push_back(audioPart);
332
+
299
333
  auto response = conversation_->SendMessage(lm_message);
300
334
  if (!response.ok()) {
301
335
  throw std::runtime_error("Audio inference failed: " +
302
336
  std::string(response.status().message()));
303
337
  }
304
338
 
305
- // Add to history
306
339
  Message userMessage;
307
340
  userMessage.role = Role::USER;
308
- userMessage.content = message + " [with audio]";
341
+ userMessage.content = message + " [Audio]";
309
342
  history_.push_back(userMessage);
310
343
 
311
344
  Message modelMessage;