react-native-litert-lm 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -9,12 +9,13 @@ High-performance LLM inference for React Native powered by [LiteRT-LM](https://g
9
9
  - ⚡ **GPU Acceleration** - GPU delegate (Android), Metal (iOS when available)
10
10
  - 📦 **Bundled Tokenizer** - No separate tokenization library needed
11
11
  - 🔄 **Streaming Support** - Token-by-token generation callbacks
12
- - 📱 **Cross-Platform** - Android API 26+ (iOS coming soon)
13
- - 🚧 **Multimodal** - Image and audio input (Coming Soon to Android)
12
+ - 📱 **Cross-Platform** - Android API 26+
13
+ - 🖼️ **Multimodal** - Image and audio input support (Android Beta, iOS coming soon)
14
+ - 🧵 **Async API** - Non-blocking inference to prevent UI freezes
14
15
 
15
16
  ## Status
16
17
 
17
- > ⚠️ **Early Preview**: This library is under active development. Android is functional with enough RAM, iOS implementation pending LiteRT-LM iOS release. Please report any issues on the [GitHub repository](https://github.com/litert-community/react-native-litert-lm).
18
+ > ⚠️ **Early Preview**: This library is under active development. Android is functional with enough RAM, iOS implementation pending LiteRT-LM iOS release. Please report any issues on the [GitHub issues](https://github.com/hung-yueh/react-native-litert-lm/issues).
18
19
 
19
20
  ## Installation
20
21
 
@@ -86,15 +87,15 @@ import { createLLM } from "react-native-litert-lm";
86
87
 
87
88
  const llm = createLLM();
88
89
 
89
- // Load a Gemma 3n model
90
- llm.loadModel("/path/to/gemma-3n-e2b.litertlm", {
90
+ // Load a Gemma 3n model (async)
91
+ await llm.loadModel("/path/to/gemma-3n-e2b.litertlm", {
91
92
  backend: "gpu",
92
93
  temperature: 0.7,
93
94
  maxTokens: 512,
94
95
  });
95
96
 
96
- // Generate response
97
- const response = llm.sendMessage("What is the capital of France?");
97
+ // Generate response (async)
98
+ const response = await llm.sendMessage("What is the capital of France?");
98
99
  console.log(response);
99
100
 
100
101
  // Clean up
@@ -113,14 +114,15 @@ llm.sendMessageAsync("Tell me a story", (token, done) => {
113
114
  ### Multimodal (Image/Audio)
114
115
 
115
116
  ```typescript
116
- // Image input (for vision models)
117
- const response = llm.sendMessageWithImage(
117
+ // Image input (for vision models like Gemma 3n)
118
+ // ⚠️ Ensure model is loaded with { maxTokens: 1024+ }
119
+ const response = await llm.sendMessageWithImage(
118
120
  "What's in this image?",
119
121
  "/path/to/image.jpg",
120
122
  );
121
123
 
122
124
  // Audio input (for audio models)
123
- const transcription = llm.sendMessageWithAudio(
125
+ const transcription = await llm.sendMessageWithAudio(
124
126
  "Transcribe this audio",
125
127
  "/path/to/audio.wav",
126
128
  );
@@ -152,7 +154,7 @@ Download `.litertlm` models from [HuggingFace](https://huggingface.co/litert-com
152
154
 
153
155
  Creates a new LLM inference engine instance.
154
156
 
155
- ### `loadModel(path, config?)`
157
+ ### `loadModel(path, config?): Promise<void>`
156
158
 
157
159
  - `path: string` - Absolute path to `.litertlm` file
158
160
  - `config.backend` - `'cpu'` | `'gpu'` | `'npu'` (default: `'gpu'`)
@@ -172,19 +174,19 @@ Creates a new LLM inference engine instance.
172
174
 
173
175
  > ⚠️ **NPU Note**: NPU acceleration requires compatible hardware (Qualcomm Hexagon, MediaTek APU, etc.). If unavailable, LiteRT-LM automatically falls back to GPU.
174
176
 
175
- ### `sendMessage(message): string`
177
+ ### `sendMessage(message): Promise<string>`
176
178
 
177
- Blocking generation. Returns complete response.
179
+ Blocking generation (executed on background thread). Returns complete response.
178
180
 
179
181
  ### `sendMessageAsync(message, callback)`
180
182
 
181
183
  Streaming generation. Callback receives `(token, isDone)`.
182
184
 
183
- ### `sendMessageWithImage(message, imagePath): string`
185
+ ### `sendMessageWithImage(message, imagePath): Promise<string>`
184
186
 
185
187
  Send a message with an image attachment (for vision models).
186
188
 
187
- ### `sendMessageWithAudio(message, audioPath): string`
189
+ ### `sendMessageWithAudio(message, audioPath): Promise<string>`
188
190
 
189
191
  Send a message with an audio attachment (for audio models).
190
192
 
@@ -19,8 +19,12 @@ import com.margelo.nitro.dev.litert.litertlm.HybridLiteRTLMSpec
19
19
  import com.margelo.nitro.dev.litert.litertlm.LLMConfig
20
20
  import com.margelo.nitro.dev.litert.litertlm.Message
21
21
  import com.margelo.nitro.dev.litert.litertlm.Role
22
+ import com.margelo.nitro.core.Promise
23
+ import com.google.ai.edge.litertlm.Content
24
+
22
25
 
23
26
  // Alias to avoid confusion with our generated Message type
27
+ // Alias to avoid confusion
24
28
  typealias LiteRTMessage = com.google.ai.edge.litertlm.Message
25
29
 
26
30
  /**
@@ -35,6 +39,10 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
35
39
  private const val TAG = "HybridLiteRTLM"
36
40
  }
37
41
 
42
+ init {
43
+ LiteRTLMRegistry.register(this)
44
+ }
45
+
38
46
  // LiteRT-LM Engine and Conversation
39
47
  private var engine: Engine? = null
40
48
  private var conversation: Conversation? = null
@@ -60,116 +68,124 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
60
68
  private var maxTokens: Int = 1024
61
69
 
62
70
  override val memorySize: Long
63
- get() = 10L * 1024L * 1024L // ~10MB estimate
71
+ get() = 1024L * 1024L * 1024L // ~1GB (models are large)
64
72
 
65
73
  // -------------------------------------------------------------------------
66
74
  // loadModel - Initialize LiteRT-LM Engine and Conversation
67
75
  // -------------------------------------------------------------------------
68
- override fun loadModel(modelPath: String, config: LLMConfig?) {
69
- Log.i(TAG, "loadModel: $modelPath")
70
-
71
- // Clean up existing resources
72
- close()
73
-
74
- // Apply configuration
75
- config?.let { cfg ->
76
- cfg.backend?.let { backend = it }
77
- cfg.temperature?.let { temperature = it }
78
- cfg.topK?.let { topK = it.toInt() }
79
- cfg.topP?.let { topP = it }
80
- cfg.maxTokens?.let { maxTokens = it.toInt() }
81
- }
76
+ override fun loadModel(modelPath: String, config: LLMConfig?): Promise<Unit> {
77
+ return Promise.parallel {
78
+ Log.i(TAG, "loadModel: $modelPath")
79
+
80
+ // Clean up existing resources
81
+ close()
82
+
83
+ // Apply configuration
84
+ config?.let { cfg ->
85
+ cfg.backend?.let { backend = it }
86
+ cfg.temperature?.let { temperature = it }
87
+ cfg.topK?.let { topK = it.toInt() }
88
+ cfg.topP?.let { topP = it }
89
+ cfg.maxTokens?.let { maxTokens = it.toInt() }
90
+ }
82
91
 
83
- try {
84
- // Map our Backend enum to LiteRT-LM Backend enum
85
- val lmBackend = when (backend) {
86
- Backend.GPU -> com.google.ai.edge.litertlm.Backend.GPU
87
- Backend.NPU -> {
88
- Log.i(TAG, "NPU backend requested - requires hardware support")
89
- com.google.ai.edge.litertlm.Backend.NPU
92
+ try {
93
+ // Map our Backend enum to LiteRT-LM Backend enum
94
+ val lmBackend = when (backend) {
95
+ Backend.GPU -> com.google.ai.edge.litertlm.Backend.GPU
96
+ Backend.NPU -> {
97
+ Log.i(TAG, "NPU backend requested - requires hardware support")
98
+ com.google.ai.edge.litertlm.Backend.NPU
99
+ }
100
+ else -> com.google.ai.edge.litertlm.Backend.CPU
90
101
  }
91
- else -> com.google.ai.edge.litertlm.Backend.CPU
92
- }
93
-
94
- // Vision backend: hardcoded to GPU (required by Gemma 3n)
95
- val lmVisionBackend = com.google.ai.edge.litertlm.Backend.GPU
96
102
 
97
- // Audio backend: hardcoded to CPU (optimal for audio processing)
98
- val lmAudioBackend = com.google.ai.edge.litertlm.Backend.CPU
99
-
100
- Log.i(TAG, "Backend config: main=$lmBackend, vision=$lmVisionBackend (hardcoded), audio=$lmAudioBackend (hardcoded)")
101
-
102
- // Get cache directory from application context
103
- // LiteRT-LM needs this to store temporary compiled model files
104
- val cacheDirectory = LiteRTLMInitProvider.applicationContext?.cacheDir?.absolutePath
105
- Log.i(TAG, "Using cache directory: $cacheDirectory")
106
-
107
- // Create Engine configuration
108
- val engineConfig = EngineConfig(
109
- modelPath = modelPath,
110
- backend = lmBackend,
111
- visionBackend = lmVisionBackend,
112
- audioBackend = lmAudioBackend,
113
- maxNumTokens = maxTokens,
114
- cacheDir = cacheDirectory
115
- )
116
-
117
- // Create Engine (heavyweight - loads model)
118
- engine = Engine(engineConfig).also { it.initialize() }
119
- Log.i(TAG, "Engine created and initialized successfully")
120
-
121
- // Create Conversation (lightweight - holds KV cache)
122
- createNewConversation()
123
- Log.i(TAG, "Conversation created successfully")
124
-
125
- } catch (e: Exception) {
126
- Log.e(TAG, "Failed to load model: ${e.message}", e)
127
- throw RuntimeException("Failed to load model: ${e.message}", e)
103
+ // Vision backend: hardcoded to GPU (required by Gemma 3n)
104
+ val lmVisionBackend = com.google.ai.edge.litertlm.Backend.GPU
105
+
106
+ // Audio backend: hardcoded to CPU (optimal for audio processing)
107
+ val lmAudioBackend = com.google.ai.edge.litertlm.Backend.CPU
108
+
109
+ Log.i(TAG, "Backend config: main=$lmBackend, vision=$lmVisionBackend (hardcoded), audio=$lmAudioBackend (hardcoded)")
110
+
111
+ // Get cache directory from application context
112
+ val cacheDirectory = LiteRTLMInitProvider.applicationContext?.cacheDir?.absolutePath
113
+ Log.i(TAG, "Using cache directory: $cacheDirectory")
114
+
115
+ // Create Engine configuration
116
+ val engineConfig = EngineConfig(
117
+ modelPath = modelPath,
118
+ backend = lmBackend,
119
+ visionBackend = lmVisionBackend,
120
+ audioBackend = lmAudioBackend,
121
+ maxNumTokens = maxTokens,
122
+ cacheDir = cacheDirectory
123
+ )
124
+
125
+ // Initialize Engine
126
+ engine = Engine(engineConfig).also { it.initialize() }
127
+ Log.i(TAG, "Engine created and initialized successfully")
128
+
129
+ // Create Conversation
130
+ createNewConversation()
131
+ Log.i(TAG, "Conversation created successfully")
132
+
133
+ } catch (e: Exception) {
134
+ Log.e(TAG, "Failed to load model: ${e.message}", e)
135
+ throw RuntimeException("Failed to load model: ${e.message}", e)
136
+ }
128
137
  }
129
138
  }
130
139
 
131
140
  // -------------------------------------------------------------------------
132
- // sendMessage - Blocking text inference
141
+ // sendMessage - Helper for one-shot generation (internally uses Async)
133
142
  // -------------------------------------------------------------------------
134
- override fun sendMessage(message: String): String {
135
- ensureLoaded()
136
-
137
- // Add user message to history
138
- history.add(Message(Role.USER, message))
139
-
140
- // Pre-process message (chat template)
141
- Log.i(TAG, "sendMessage: $message")
142
-
143
- // Blocking inference
144
- // LiteRT-LM expects a Message object, not String
145
- val userMsg = LiteRTMessage.of(message)
146
- val responseMsg = conversation!!.sendMessage(userMsg)
147
-
148
- // Extract text from response Message
149
- val response = responseMsg.contents
150
- .filterIsInstance<com.google.ai.edge.litertlm.Content.Text>()
151
- .joinToString("") { it.text }
152
-
153
- // Add model response to history
154
- history.add(Message(Role.MODEL, response))
155
-
156
- // Update stats (mock/approximate for now as SDK doesn't return full stats for sync call)
157
- lastStats = GenerationStats(
158
- promptTokens = message.length / 4.0,
159
- completionTokens = response.length / 4.0,
160
- totalTokens = (message.length + response.length) / 4.0,
161
- timeToFirstToken = 0.0,
162
- totalTime = 0.0,
163
- tokensPerSecond = 0.0
164
- )
165
-
166
- return response
143
+ override fun sendMessage(message: String): Promise<String> {
144
+ // Implement Promise-based sendMessage using suspend coroutine logic wrapped in Promise
145
+ // Since Promise.parallel expects a blocking block returning T,
146
+ // and sendMessageAsync is callback-based, we need to bridge them.
147
+ // HOWEVER, we can just use the synchronous `sendMessage` API of the SDK
148
+ // inside the `Promise.parallel` block, which moves it off the main thread!
149
+ return Promise.parallel {
150
+ ensureLoaded()
151
+
152
+ // Add user message to history
153
+ history.add(Message(Role.USER, message))
154
+ Log.i(TAG, "sendMessage (Promise): $message")
155
+
156
+ // Blocking inference (safe here because we are in Promise.parallel worker thread)
157
+ val userMsg = LiteRTMessage.of(message)
158
+ val responseMsg = conversation!!.sendMessage(userMsg)
159
+
160
+ // Extract text
161
+ val response = responseMsg.contents
162
+ .filterIsInstance<com.google.ai.edge.litertlm.Content.Text>()
163
+ .joinToString("") { it.text }
164
+
165
+ // Add model response to history
166
+ history.add(Message(Role.MODEL, response))
167
+
168
+ // Update stats
169
+ lastStats = GenerationStats(
170
+ promptTokens = message.length / 4.0,
171
+ completionTokens = response.length / 4.0,
172
+ totalTokens = (message.length + response.length) / 4.0,
173
+ timeToFirstToken = 0.0,
174
+ totalTime = 0.0,
175
+ tokensPerSecond = 0.0
176
+ )
177
+
178
+ response // Return the string
179
+ }
167
180
  }
168
181
 
169
182
  // -------------------------------------------------------------------------
170
183
  // sendMessageAsync - Streaming inference
171
184
  // -------------------------------------------------------------------------
172
185
  override fun sendMessageAsync(message: String, onToken: (String, Boolean) -> Unit) {
186
+ // This is already async (void return), so we execute immediately on the calling thread
187
+ // (which is the Nitro specialized thread, not Main).
188
+ // The SDK's sendMessageAsync is non-blocking anyway.
173
189
  ensureLoaded()
174
190
 
175
191
  // Add user message to history
@@ -206,12 +222,8 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
206
222
  }
207
223
 
208
224
  try {
209
- // Construct Message object
210
225
  val userMsg = LiteRTMessage.of(message)
211
-
212
- // LiteRT-LM async call - SDK handles threading
213
226
  conversation!!.sendMessageAsync(userMsg, listener)
214
-
215
227
  } catch (e: Exception) {
216
228
  Log.e(TAG, "Failed into initiate async generation", e)
217
229
  onToken("Error: ${e.message}", true)
@@ -221,14 +233,63 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
221
233
  // -------------------------------------------------------------------------
222
234
  // Multimodal methods
223
235
  // -------------------------------------------------------------------------
224
- override fun sendMessageWithImage(message: String, imagePath: String): String {
225
- // TODO: Implement image loading from path
226
- throw RuntimeException("Multimodal (Image) not yet implemented in this wrapper")
236
+ override fun sendMessageWithImage(message: String, imagePath: String): Promise<String> {
237
+ return Promise.parallel {
238
+ ensureLoaded()
239
+ Log.i(TAG, "sendMessageWithImage: $message, path=$imagePath")
240
+
241
+ // Create multimodal message
242
+ // Use factory method Message.of passing a list of Content
243
+ val textContent = Content.Text(message)
244
+
245
+ val contentList = listOf(
246
+ textContent,
247
+ Content.ImageFile(imagePath)
248
+ )
249
+
250
+ val userMsg = LiteRTMessage.of(contentList)
251
+
252
+ // Add to history
253
+ history.add(Message(Role.USER, "$message [Image]"))
254
+
255
+ val responseMsg = conversation!!.sendMessage(userMsg)
256
+
257
+ val response = responseMsg.contents
258
+ .filterIsInstance<Content.Text>()
259
+ .joinToString("") { it.text }
260
+
261
+ history.add(Message(Role.MODEL, response))
262
+
263
+ response
264
+ }
227
265
  }
228
266
 
229
- override fun sendMessageWithAudio(message: String, audioPath: String): String {
230
- // TODO: Implement audio loading from path
231
- throw RuntimeException("Multimodal (Audio) not yet implemented in this wrapper")
267
+ override fun sendMessageWithAudio(message: String, audioPath: String): Promise<String> {
268
+ return Promise.parallel {
269
+ ensureLoaded()
270
+ Log.i(TAG, "sendMessageWithAudio: $message, path=$audioPath")
271
+
272
+ // Load audio
273
+
274
+ val contentList = listOf(
275
+ Content.Text(message),
276
+ Content.AudioFile(audioPath)
277
+ )
278
+
279
+ val userMsg = LiteRTMessage.of(contentList)
280
+
281
+ history.add(Message(Role.USER, "$message [Audio]"))
282
+
283
+ val responseMsg = conversation!!.sendMessage(userMsg)
284
+
285
+ val response = responseMsg.contents
286
+ .filterIsInstance<Content.Text>()
287
+ .joinToString("") { it.text }
288
+
289
+ history.add(Message(Role.MODEL, response))
290
+
291
+ response
292
+ }
232
293
  }
233
294
 
234
295
  // -------------------------------------------------------------------------
@@ -277,4 +338,6 @@ class HybridLiteRTLM : HybridLiteRTLMSpec() {
277
338
  // Dispose old conversation if needed
278
339
  conversation = engine!!.createConversation()
279
340
  }
341
+
342
+
280
343
  }
@@ -0,0 +1,32 @@
1
+ package com.margelo.nitro.dev.litert.litertlm
2
+
3
+ import java.util.Collections
4
+ import java.util.WeakHashMap
5
+ import android.util.Log
6
+
7
+ /**
8
+ * Global registry to track active LiteRTLM instances.
9
+ * Used for memory trimming and cleanup.
10
+ */
11
+ object LiteRTLMRegistry {
12
+ private const val TAG = "LiteRTLMRegistry"
13
+
14
+ // Use WeakSet-like structure to prevent leaks
15
+ private val instances = Collections.newSetFromMap(WeakHashMap<HybridLiteRTLM, Boolean>())
16
+
17
+ fun register(instance: HybridLiteRTLM) {
18
+ synchronized(instances) {
19
+ instances.add(instance)
20
+ }
21
+ }
22
+
23
+ fun onTrimMemory(level: Int) {
24
+ Log.w(TAG, "Received memory warning (level=$level). Releasing resources...")
25
+ synchronized(instances) {
26
+ instances.forEach { it.close() }
27
+ // Note: We don't clear the set here, as close() should be idempotent
28
+ // and the instance might still be ref-counted by JS.
29
+ // We just ensure the HEAVY native resources are gone.
30
+ }
31
+ }
32
+ }
@@ -17,6 +17,20 @@ class LiteRTLMInitProvider : ContentProvider() {
17
17
  override fun onCreate(): Boolean {
18
18
  applicationContext = context?.applicationContext
19
19
  Log.i(TAG, "LiteRTLMInitProvider initialized with context: $applicationContext")
20
+
21
+ applicationContext?.registerComponentCallbacks(object : android.content.ComponentCallbacks2 {
22
+ override fun onTrimMemory(level: Int) {
23
+ if (level >= android.content.ComponentCallbacks2.TRIM_MEMORY_RUNNING_LOW) {
24
+ com.margelo.nitro.dev.litert.litertlm.LiteRTLMRegistry.onTrimMemory(level)
25
+ }
26
+ }
27
+
28
+ override fun onConfigurationChanged(newConfig: android.content.res.Configuration) {}
29
+ override fun onLowMemory() {
30
+ com.margelo.nitro.dev.litert.litertlm.LiteRTLMRegistry.onTrimMemory(android.content.ComponentCallbacks2.TRIM_MEMORY_COMPLETE)
31
+ }
32
+ })
33
+
20
34
  return true
21
35
  }
22
36
 
@@ -11,9 +11,13 @@
11
11
 
12
12
  #include "HybridLiteRTLM.hpp"
13
13
 
14
+ #define STB_IMAGE_IMPLEMENTATION
15
+ #include "include/stb_image.h"
16
+
14
17
  #include <chrono>
15
18
  #include <stdexcept>
16
19
  #include <sstream>
20
+ #include <fstream>
17
21
 
18
22
  namespace margelo::nitro::litertlm {
19
23
 
@@ -229,32 +233,46 @@ std::string HybridLiteRTLM::sendMessageWithImage(
229
233
  ensureLoaded();
230
234
 
231
235
  #ifdef LITERT_LM_ENABLED
232
- // TODO: Load image file into raw pixel buffer
233
- // The Engine expects raw RGBA/RGB data, not a file path.
234
- // Implementation should:
235
- // 1. Read image file (using stb_image.h or Android Bitmap JNI)
236
- // 2. Decode to raw pixel buffer (std::vector<uint8_t>)
237
- // 3. Create litert::lm::ImageData or equivalent tensor
238
- // 4. Pass to conversation_->SendMessage with multimodal content
239
-
240
- // For now, fall back to text-only with a note about the image
241
- std::string augmentedMessage = message + " [Image attached: " + imagePath +
242
- " - Note: Image processing not yet implemented, text-only response]";
243
-
236
+ // Load image using stb_image
237
+ int width, height, channels;
238
+ unsigned char* img = stbi_load(imagePath.c_str(), &width, &height, &channels, 3); // Force 3 channels (RGB)
239
+ if (img == nullptr) {
240
+ throw std::runtime_error("Failed to load image from path: " + imagePath);
241
+ }
242
+
243
+ // Create input tensor/buffer for the engine.
244
+ // Note: The exact API for passing image data depends on the LiteRT-LM version.
245
+ // Assuming a structure that accepts raw bytes and dimensions.
244
246
  litert::lm::UserMessage lm_message;
245
247
  lm_message.role = "user";
246
- lm_message.content = augmentedMessage;
247
248
 
249
+ // Construct multimodal content
250
+ // Option A: If UserMessage supports a list of content parts
251
+ litert::lm::ContentPart textPart;
252
+ textPart.type = litert::lm::ContentType::TEXT;
253
+ textPart.text = message;
254
+ lm_message.parts.push_back(textPart);
255
+
256
+ litert::lm::ContentPart imagePart;
257
+ imagePart.type = litert::lm::ContentType::IMAGE;
258
+ imagePart.image.width = width;
259
+ imagePart.image.height = height;
260
+ imagePart.image.channels = channels;
261
+ imagePart.image.data = std::vector<uint8_t>(img, img + (width * height * channels));
262
+ lm_message.parts.push_back(imagePart);
263
+
264
+ stbi_image_free(img);
265
+
248
266
  auto response = conversation_->SendMessage(lm_message);
249
267
  if (!response.ok()) {
250
268
  throw std::runtime_error("Multimodal inference failed: " +
251
269
  std::string(response.status().message()));
252
270
  }
253
271
 
254
- // Add to history
272
+ // Add to history (metadata only)
255
273
  Message userMessage;
256
274
  userMessage.role = Role::USER;
257
- userMessage.content = message + " [with image]";
275
+ userMessage.content = message + " [Image]";
258
276
  history_.push_back(userMessage);
259
277
 
260
278
  Message modelMessage;
@@ -266,6 +284,11 @@ std::string HybridLiteRTLM::sendMessageWithImage(
266
284
 
267
285
  #else
268
286
  // Stub: just process text with image path noted
287
+ // Verify file exists at least
288
+ std::ifstream f(imagePath.c_str());
289
+ if (!f.good()) {
290
+ // Don't crash, just log/stub
291
+ }
269
292
  return sendMessage(message + " [Image: " + imagePath + "]");
270
293
  #endif
271
294
  }
@@ -281,31 +304,41 @@ std::string HybridLiteRTLM::sendMessageWithAudio(
281
304
  ensureLoaded();
282
305
 
283
306
  #ifdef LITERT_LM_ENABLED
284
- // TODO: Load audio file into raw sample buffer
285
- // Similar to image - Engine expects raw audio samples, not file path.
286
- // Implementation should:
287
- // 1. Read WAV file header and samples
288
- // 2. Convert to expected format (likely 16kHz mono float32)
289
- // 3. Create litert::lm::AudioData or equivalent
290
- // 4. Pass to conversation with multimodal content
307
+ // Load audio file
308
+ std::ifstream audioFile(audioPath, std::ios::binary);
309
+ if (!audioFile) {
310
+ throw std::runtime_error("Failed to open audio file: " + audioPath);
311
+ }
291
312
 
292
- std::string augmentedMessage = message + " [Audio attached: " + audioPath +
293
- " - Note: Audio processing not yet implemented, text-only response]";
313
+ // Simple WAV header skip (simplistic, assuming standard header size for now or raw)
314
+ // Ideally use a WAV parsing library or miniaudio if available.
315
+ // For this implementation, we read the whole file.
316
+ std::vector<uint8_t> audioData((std::istreambuf_iterator<char>(audioFile)), std::istreambuf_iterator<char>());
294
317
 
295
318
  litert::lm::UserMessage lm_message;
296
319
  lm_message.role = "user";
297
- lm_message.content = augmentedMessage;
298
320
 
321
+ litert::lm::ContentPart textPart;
322
+ textPart.type = litert::lm::ContentType::TEXT;
323
+ textPart.text = message;
324
+ lm_message.parts.push_back(textPart);
325
+
326
+ litert::lm::ContentPart audioPart;
327
+ audioPart.type = litert::lm::ContentType::AUDIO;
328
+ audioPart.audio.data = audioData;
329
+ // Metadata like sample rate might be needed:
330
+ // audioPart.audio.sample_rate = 16000;
331
+ lm_message.parts.push_back(audioPart);
332
+
299
333
  auto response = conversation_->SendMessage(lm_message);
300
334
  if (!response.ok()) {
301
335
  throw std::runtime_error("Audio inference failed: " +
302
336
  std::string(response.status().message()));
303
337
  }
304
338
 
305
- // Add to history
306
339
  Message userMessage;
307
340
  userMessage.role = Role::USER;
308
- userMessage.content = message + " [with audio]";
341
+ userMessage.content = message + " [Audio]";
309
342
  history_.push_back(userMessage);
310
343
 
311
344
  Message modelMessage;