react-native-litert-lm 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/README.md +331 -150
  2. package/android/build.gradle +1 -1
  3. package/android/src/main/java/com/margelo/nitro/dev/litert/litertlm/HybridLiteRTLM.kt +140 -37
  4. package/app.plugin.js +33 -0
  5. package/cpp/HybridLiteRTLM.cpp +577 -378
  6. package/cpp/HybridLiteRTLM.hpp +66 -23
  7. package/cpp/IOSDownloadHelper.h +24 -0
  8. package/cpp/cpp-adapter.cpp +10 -2
  9. package/cpp/include/litert_lm_engine.h +502 -0
  10. package/ios/IOSDownloadHelper.mm +129 -0
  11. package/ios/LiteRTLMAutolinking.mm +30 -0
  12. package/lib/hooks.d.ts +33 -3
  13. package/lib/hooks.js +54 -23
  14. package/lib/index.d.ts +4 -1
  15. package/lib/index.js +6 -6
  16. package/lib/memoryTracker.d.ts +128 -0
  17. package/lib/memoryTracker.js +155 -0
  18. package/lib/modelFactory.d.ts +21 -2
  19. package/lib/modelFactory.js +78 -11
  20. package/lib/specs/LiteRTLM.nitro.d.ts +19 -0
  21. package/nitrogen/generated/android/LiteRTLMOnLoad.cpp +28 -18
  22. package/nitrogen/generated/android/LiteRTLMOnLoad.hpp +13 -4
  23. package/nitrogen/generated/android/c++/JHybridLiteRTLMSpec.cpp +39 -36
  24. package/nitrogen/generated/android/c++/JHybridLiteRTLMSpec.hpp +20 -22
  25. package/nitrogen/generated/android/c++/JMemoryUsage.hpp +69 -0
  26. package/nitrogen/generated/android/kotlin/com/margelo/nitro/dev/litert/litertlm/HybridLiteRTLMSpec.kt +19 -18
  27. package/nitrogen/generated/android/kotlin/com/margelo/nitro/dev/litert/litertlm/MemoryUsage.kt +47 -0
  28. package/nitrogen/generated/shared/c++/HybridLiteRTLMSpec.cpp +1 -0
  29. package/nitrogen/generated/shared/c++/HybridLiteRTLMSpec.hpp +4 -0
  30. package/nitrogen/generated/shared/c++/MemoryUsage.hpp +95 -0
  31. package/package.json +12 -5
  32. package/react-native-litert-lm.podspec +20 -7
  33. package/scripts/build-ios-engine.sh +283 -0
  34. package/scripts/download-ios-frameworks.sh +72 -0
  35. package/scripts/postinstall.js +116 -0
  36. package/scripts/stubs/cxx_bridge_stubs.cc +224 -0
  37. package/scripts/stubs/gemma_model_constraint_provider.cc +46 -0
  38. package/scripts/stubs/llguidance_stubs.c +101 -0
  39. package/src/hooks.ts +107 -41
  40. package/src/index.ts +13 -6
  41. package/src/memoryTracker.ts +268 -0
  42. package/src/modelFactory.ts +107 -11
  43. package/src/specs/LiteRTLM.nitro.ts +21 -0
@@ -2,7 +2,7 @@
2
2
  // HybridLiteRTLM.cpp
3
3
  // react-native-litert-lm
4
4
  //
5
- // High-performance LLM inference using LiteRT-LM.
5
+ // High-performance LLM inference using LiteRT-LM C API.
6
6
  //
7
7
  // NOTE: This C++ implementation is used for iOS ONLY.
8
8
  // Android uses the Kotlin implementation in `android/src/main/java/com/margelo/nitro/dev/litert/litertlm/HybridLiteRTLM.kt`.
@@ -11,80 +11,202 @@
11
11
 
12
12
  #include "HybridLiteRTLM.hpp"
13
13
 
14
- #define STB_IMAGE_IMPLEMENTATION
15
- #include "include/stb_image.h"
16
14
 
15
+
16
+
17
+ #include <NitroModules/Promise.hpp>
17
18
  #include <chrono>
18
19
  #include <stdexcept>
19
20
  #include <sstream>
21
+
22
+ #ifdef __APPLE__
23
+ #include "IOSDownloadHelper.h"
24
+ #endif
20
25
  #include <fstream>
26
+ #include <thread>
27
+ #include <regex>
21
28
 
22
29
  namespace margelo::nitro::litertlm {
23
30
 
24
- //------------------------------------------------------------------------------
25
- // Helper: Format user prompt (applies chat template if needed)
26
- //------------------------------------------------------------------------------
27
- std::string HybridLiteRTLM::formatUserPrompt(const std::string& message) const {
28
- // The LiteRT-LM Conversation class handles chat templates internally,
29
- // so we just return the message as-is. If we were using Session directly,
30
- // we'd apply the Gemma/Phi template here.
31
- return message;
31
+ // =============================================================================
32
+ // JSON Helpers
33
+ // =============================================================================
34
+
35
+ std::string HybridLiteRTLM::escapeJson(const std::string& input) {
36
+ std::string output;
37
+ output.reserve(input.size() + 16);
38
+ for (char c : input) {
39
+ switch (c) {
40
+ case '"': output += "\\\""; break;
41
+ case '\\': output += "\\\\"; break;
42
+ case '\n': output += "\\n"; break;
43
+ case '\r': output += "\\r"; break;
44
+ case '\t': output += "\\t"; break;
45
+ case '\b': output += "\\b"; break;
46
+ case '\f': output += "\\f"; break;
47
+ default: output += c; break;
48
+ }
49
+ }
50
+ return output;
51
+ }
52
+
53
+ std::string HybridLiteRTLM::buildTextMessageJson(const std::string& text) {
54
+ return "{\"role\":\"user\",\"content\":\"" + escapeJson(text) + "\"}";
55
+ }
56
+
57
+ std::string HybridLiteRTLM::buildImageMessageJson(const std::string& text, const std::string& imagePath) {
58
+ return "{\"role\":\"user\",\"content\":["
59
+ "{\"type\":\"text\",\"text\":\"" + escapeJson(text) + "\"},"
60
+ "{\"type\":\"image\",\"path\":\"" + escapeJson(imagePath) + "\"}"
61
+ "]}";
62
+ }
63
+
64
+ std::string HybridLiteRTLM::buildAudioMessageJson(const std::string& text, const std::string& audioPath) {
65
+ return "{\"role\":\"user\",\"content\":["
66
+ "{\"type\":\"text\",\"text\":\"" + escapeJson(text) + "\"},"
67
+ "{\"type\":\"audio\",\"path\":\"" + escapeJson(audioPath) + "\"}"
68
+ "]}";
69
+ }
70
+
71
+ std::string HybridLiteRTLM::extractTextFromResponse(const std::string& jsonResponse) {
72
+ // The C API response JSON is structured as:
73
+ // {"role":"model","content":[{"type":"text","text":"..."}]}
74
+ // or:
75
+ // {"role":"model","content":"..."}
76
+ //
77
+ // We use simple string extraction to avoid a JSON library dependency.
78
+
79
+ // Try array format first: find "text":"..." after "type":"text"
80
+ std::string textMarker = "\"text\":\"";
81
+ size_t pos = jsonResponse.find("\"type\":\"text\"");
82
+ if (pos != std::string::npos) {
83
+ pos = jsonResponse.find(textMarker, pos);
84
+ if (pos != std::string::npos) {
85
+ pos += textMarker.length();
86
+ std::string result;
87
+ result.reserve(jsonResponse.size() - pos);
88
+ for (size_t i = pos; i < jsonResponse.size(); i++) {
89
+ if (jsonResponse[i] == '\\' && i + 1 < jsonResponse.size()) {
90
+ char next = jsonResponse[i + 1];
91
+ if (next == '"') { result += '"'; i++; }
92
+ else if (next == '\\') { result += '\\'; i++; }
93
+ else if (next == 'n') { result += '\n'; i++; }
94
+ else if (next == 'r') { result += '\r'; i++; }
95
+ else if (next == 't') { result += '\t'; i++; }
96
+ else { result += jsonResponse[i]; }
97
+ } else if (jsonResponse[i] == '"') {
98
+ break; // End of the text value
99
+ } else {
100
+ result += jsonResponse[i];
101
+ }
102
+ }
103
+ return result;
104
+ }
105
+ }
106
+
107
+ // Try simple string format: "content":"..."
108
+ std::string contentMarker = "\"content\":\"";
109
+ pos = jsonResponse.find(contentMarker);
110
+ if (pos != std::string::npos) {
111
+ pos += contentMarker.length();
112
+ std::string result;
113
+ for (size_t i = pos; i < jsonResponse.size(); i++) {
114
+ if (jsonResponse[i] == '\\' && i + 1 < jsonResponse.size()) {
115
+ char next = jsonResponse[i + 1];
116
+ if (next == '"') { result += '"'; i++; }
117
+ else if (next == '\\') { result += '\\'; i++; }
118
+ else if (next == 'n') { result += '\n'; i++; }
119
+ else { result += jsonResponse[i]; }
120
+ } else if (jsonResponse[i] == '"') {
121
+ break;
122
+ } else {
123
+ result += jsonResponse[i];
124
+ }
125
+ }
126
+ return result;
127
+ }
128
+
129
+ // Fallback: return full response
130
+ return jsonResponse;
32
131
  }
33
132
 
34
- //------------------------------------------------------------------------------
35
- // Helper: Create a new Conversation from existing Engine
36
- //------------------------------------------------------------------------------
133
+ // =============================================================================
134
+ // Conversation Management
135
+ // =============================================================================
136
+
37
137
  void HybridLiteRTLM::createNewConversation() {
38
- #ifdef LITERT_LM_ENABLED
138
+ #ifdef __APPLE__
39
139
  if (!engine_) {
40
140
  throw std::runtime_error("Cannot create conversation: engine not initialized");
41
141
  }
42
142
 
43
- auto conversation_config = litert::lm::ConversationConfig::CreateDefault(*engine_);
44
- if (!conversation_config.ok()) {
45
- throw std::runtime_error("Failed to create conversation config: " +
46
- std::string(conversation_config.status().message()));
143
+ // Clean up previous conversation
144
+ if (conversation_) {
145
+ litert_lm_conversation_delete(conversation_);
146
+ conversation_ = nullptr;
147
+ }
148
+ if (conv_config_) {
149
+ litert_lm_conversation_config_delete(conv_config_);
150
+ conv_config_ = nullptr;
151
+ }
152
+
153
+ // Build system message JSON if provided
154
+ std::string systemMsgJson;
155
+ const char* systemMsgPtr = nullptr;
156
+ if (!systemPrompt_.empty()) {
157
+ systemMsgJson = "{\"role\":\"system\",\"content\":\"" + escapeJson(systemPrompt_) + "\"}";
158
+ systemMsgPtr = systemMsgJson.c_str();
159
+ }
160
+
161
+ // Create conversation config with session config
162
+ conv_config_ = litert_lm_conversation_config_create(
163
+ engine_,
164
+ session_config_, // may be nullptr for defaults
165
+ systemMsgPtr, // system message
166
+ nullptr, // tools (not used yet)
167
+ nullptr, // messages history
168
+ false // constrained decoding
169
+ );
170
+ if (!conv_config_) {
171
+ throw std::runtime_error("Failed to create conversation config");
47
172
  }
48
173
 
49
- auto conversation = litert::lm::Conversation::Create(*engine_, *conversation_config);
50
- if (!conversation.ok()) {
51
- throw std::runtime_error("Failed to create conversation: " +
52
- std::string(conversation.status().message()));
174
+ // Create conversation
175
+ conversation_ = litert_lm_conversation_create(engine_, conv_config_);
176
+ if (!conversation_) {
177
+ litert_lm_conversation_config_delete(conv_config_);
178
+ conv_config_ = nullptr;
179
+ throw std::runtime_error("Failed to create conversation");
53
180
  }
54
- conversation_ = std::move(*conversation);
55
181
  #endif
56
182
  }
57
183
 
58
- //------------------------------------------------------------------------------
59
- // loadModel - Initialize Engine and Conversation
60
- //------------------------------------------------------------------------------
61
- void HybridLiteRTLM::loadModel(
184
+ // =============================================================================
185
+ // loadModel
186
+ // =============================================================================
187
+
188
+ std::shared_ptr<Promise<void>> HybridLiteRTLM::loadModel(
189
+ const std::string& modelPath,
190
+ const std::optional<LLMConfig>& config) {
191
+ return Promise<void>::async([this, modelPath, config]() {
192
+ loadModelInternal(modelPath, config);
193
+ });
194
+ }
195
+
196
+ void HybridLiteRTLM::loadModelInternal(
62
197
  const std::string& modelPath,
63
198
  const std::optional<LLMConfig>& config) {
64
199
 
65
200
  std::lock_guard<std::mutex> lock(mutex_);
66
201
 
67
- // Clean up existing resources
68
202
  if (isLoaded_) {
69
- isLoaded_ = false;
70
- history_.clear();
71
- #ifdef LITERT_LM_ENABLED
72
- conversation_.reset();
73
- engine_.reset();
74
- #endif
203
+ close();
75
204
  }
76
205
 
77
- // Apply configuration
78
206
  if (config.has_value()) {
79
207
  if (config->backend.has_value()) {
80
208
  backend_ = config->backend.value();
81
209
  }
82
- if (config->visionBackend.has_value()) {
83
- visionBackend_ = config->visionBackend.value();
84
- }
85
- if (config->audioBackend.has_value()) {
86
- audioBackend_ = config->audioBackend.value();
87
- }
88
210
  if (config->temperature.has_value()) {
89
211
  temperature_ = config->temperature.value();
90
212
  }
@@ -97,441 +219,518 @@ void HybridLiteRTLM::loadModel(
97
219
  if (config->maxTokens.has_value()) {
98
220
  maxTokens_ = config->maxTokens.value();
99
221
  }
222
+ if (config->systemPrompt.has_value()) {
223
+ systemPrompt_ = config->systemPrompt.value();
224
+ }
100
225
  }
101
226
 
102
- #ifdef LITERT_LM_ENABLED
103
- // 1. Create ModelAssets from model path
104
- auto model_assets = litert::lm::ModelAssets::Create(modelPath);
105
- if (!model_assets.ok()) {
106
- throw std::runtime_error("Failed to load model assets: " +
107
- std::string(model_assets.status().message()));
108
- }
109
-
110
- // 2. Map our Backend enum to LiteRT-LM Backend enum
111
- auto engine_backend = (backend_ == Backend::GPU)
112
- ? litert::lm::Backend::GPU
113
- : litert::lm::Backend::CPU;
114
- auto vision_backend = (visionBackend_ == Backend::GPU)
115
- ? litert::lm::Backend::GPU
116
- : litert::lm::Backend::CPU;
117
- auto audio_backend = (audioBackend_ == Backend::GPU)
118
- ? litert::lm::Backend::GPU
119
- : litert::lm::Backend::CPU;
120
-
121
- // 3. Create EngineSettings with all backends
122
- auto engine_settings = litert::lm::EngineSettings::CreateDefault(
123
- *model_assets,
124
- engine_backend,
125
- vision_backend,
126
- audio_backend
127
- );
227
+ #ifdef __APPLE__
228
+ // Set log verbosity: 2=WARNING (production), 0=INFO (debug)
229
+ litert_lm_set_min_log_level(2);
128
230
 
129
- // 4. Create the Engine (heavyweight - loads model weights)
130
- auto engine = litert::lm::Engine::CreateEngine(engine_settings);
131
- if (!engine.ok()) {
132
- throw std::runtime_error("Failed to create engine: " +
133
- std::string(engine.status().message()));
231
+ auto backendStr = [](Backend b) -> const char* {
232
+ switch (b) {
233
+ case Backend::GPU: return "gpu";
234
+ case Backend::NPU: return "gpu"; // NPU not available on iOS, use GPU
235
+ default: return "cpu";
236
+ }
237
+ };
238
+
239
+ auto tryCreateEngine = [&](const char* backend, const char* visionBackend) -> bool {
240
+ auto* settings = litert_lm_engine_settings_create(
241
+ modelPath.c_str(),
242
+ backend,
243
+ visionBackend,
244
+ "cpu" // audio always on CPU
245
+ );
246
+ if (!settings) {
247
+ return false;
248
+ }
249
+
250
+ litert_lm_engine_settings_set_max_num_tokens(settings, static_cast<int>(maxTokens_));
251
+ litert_lm_engine_settings_enable_benchmark(settings);
252
+
253
+ engine_ = litert_lm_engine_create(settings);
254
+ litert_lm_engine_settings_delete(settings);
255
+
256
+ return engine_ != nullptr;
257
+ };
258
+
259
+ // Try requested backend first (e.g. gpu/gpu)
260
+ const char* primaryBackend = backendStr(backend_);
261
+ if (!tryCreateEngine(primaryBackend, primaryBackend)) {
262
+ // Fallback chain for when the primary backend fails:
263
+ bool fallbackOk = false;
264
+ if (backend_ != Backend::CPU) {
265
+ // 1) Try CPU main + GPU vision (model's vision encoder often requires GPU)
266
+ fallbackOk = tryCreateEngine("cpu", "gpu");
267
+ // 2) Try CPU main + CPU vision
268
+ if (!fallbackOk) fallbackOk = tryCreateEngine("cpu", "cpu");
269
+ }
270
+ // 3) Try CPU main + no vision (nullptr skips vision executor entirely)
271
+ if (!fallbackOk) fallbackOk = tryCreateEngine("cpu", nullptr);
272
+ if (fallbackOk) {
273
+ backend_ = Backend::CPU;
274
+ }
275
+ }
276
+
277
+ if (!engine_) {
278
+ throw std::runtime_error(
279
+ "Failed to create LiteRT-LM engine. Tried backend '" +
280
+ std::string(primaryBackend) + "' and CPU fallback. Model path: " + modelPath);
134
281
  }
135
- engine_ = std::move(*engine);
136
-
137
- // 5. Create the Conversation (lightweight - holds KV cache)
138
- createNewConversation();
139
282
 
140
- #endif // LITERT_LM_ENABLED
283
+ session_config_ = litert_lm_session_config_create();
284
+ if (session_config_) {
285
+ litert_lm_session_config_set_max_output_tokens(session_config_, static_cast<int>(maxTokens_));
286
+
287
+ LiteRtLmSamplerParams sampler{};
288
+ sampler.type = kTopP;
289
+ sampler.top_k = static_cast<int32_t>(topK_);
290
+ sampler.top_p = static_cast<float>(topP_);
291
+ sampler.temperature = static_cast<float>(temperature_);
292
+ sampler.seed = 0;
293
+ litert_lm_session_config_set_sampler_params(session_config_, &sampler);
294
+ }
295
+
296
+ createNewConversation();
297
+ #endif
141
298
 
142
299
  isLoaded_ = true;
143
300
  history_.clear();
144
-
145
- // Reset stats
146
301
  lastStats_ = GenerationStats{0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
147
302
  }
148
303
 
149
- //------------------------------------------------------------------------------
150
- // sendMessage - Blocking text inference
151
- //------------------------------------------------------------------------------
152
- std::string HybridLiteRTLM::sendMessage(const std::string& message) {
304
+ // =============================================================================
305
+ // sendMessage Blocking text inference
306
+ // =============================================================================
307
+
308
+ std::shared_ptr<Promise<std::string>> HybridLiteRTLM::sendMessage(const std::string& message) {
309
+ return Promise<std::string>::async([this, message]() -> std::string {
310
+ return sendMessageInternal(message);
311
+ });
312
+ }
313
+
314
+ std::string HybridLiteRTLM::sendMessageInternal(const std::string& message) {
153
315
  std::lock_guard<std::mutex> lock(mutex_);
154
316
  ensureLoaded();
155
317
 
156
- auto startTime = std::chrono::high_resolution_clock::now();
318
+ auto startTime = std::chrono::steady_clock::now();
319
+ std::string result;
157
320
 
158
- // Add user message to history
159
- Message userMessage;
160
- userMessage.role = Role::USER;
161
- userMessage.content = message;
162
- history_.push_back(userMessage);
321
+ #ifdef __APPLE__
322
+ std::string msgJson = buildTextMessageJson(message);
323
+
324
+ auto* response = litert_lm_conversation_send_message(
325
+ conversation_, msgJson.c_str(), nullptr);
326
+
327
+ if (!response) {
328
+ throw std::runtime_error("LiteRT-LM: sendMessage failed");
329
+ }
330
+
331
+ const char* responseStr = litert_lm_json_response_get_string(response);
332
+ if (responseStr) {
333
+ result = extractTextFromResponse(std::string(responseStr));
334
+ }
335
+ litert_lm_json_response_delete(response);
336
+
337
+ auto* benchInfo = litert_lm_conversation_get_benchmark_info(conversation_);
338
+ if (benchInfo) {
339
+ int numDecodeTurns = litert_lm_benchmark_info_get_num_decode_turns(benchInfo);
340
+ if (numDecodeTurns > 0) {
341
+ int lastIdx = numDecodeTurns - 1;
342
+ lastStats_.tokensPerSecond = litert_lm_benchmark_info_get_decode_tokens_per_sec_at(benchInfo, lastIdx);
343
+ lastStats_.completionTokens = static_cast<double>(
344
+ litert_lm_benchmark_info_get_decode_token_count_at(benchInfo, lastIdx));
345
+ }
346
+ lastStats_.timeToFirstToken = litert_lm_benchmark_info_get_time_to_first_token(benchInfo);
347
+ litert_lm_benchmark_info_delete(benchInfo);
348
+ }
349
+ #else
350
+ // Non-Apple stub
351
+ result = "[iOS only] LiteRT-LM inference not available on this platform.";
352
+ #endif
163
353
 
164
- std::string responseText;
354
+ auto endTime = std::chrono::steady_clock::now();
355
+ double latencyMs = std::chrono::duration<double, std::milli>(endTime - startTime).count();
356
+ lastStats_.totalTime = latencyMs / 1000.0;
165
357
 
166
- #ifdef LITERT_LM_ENABLED
167
- // Build the message struct for LiteRT-LM
168
- // The Conversation API expects a structured input
169
- litert::lm::UserMessage lm_message;
170
- lm_message.role = "user";
171
- lm_message.content = message;
358
+ // Update history
359
+ history_.push_back(Message{Role::USER, message});
360
+ history_.push_back(Message{Role::MODEL, result});
172
361
 
173
- auto response = conversation_->SendMessage(lm_message);
174
- if (!response.ok()) {
175
- // Remove the user message we just added since inference failed
176
- history_.pop_back();
177
- throw std::runtime_error("Inference failed: " +
178
- std::string(response.status().message()));
362
+ return result;
363
+ }
364
+
365
+ // =============================================================================
366
+ // sendMessageAsync Streaming text inference
367
+ // =============================================================================
368
+
369
+ void HybridLiteRTLM::streamCallbackFn(void* callback_data, const char* chunk,
370
+ bool is_final, const char* error_msg) {
371
+ auto* ctx = static_cast<StreamContext*>(callback_data);
372
+
373
+ if (error_msg) {
374
+ // Error occurred — notify JS and clean up
375
+ ctx->onToken(std::string("Error: ") + error_msg, true);
376
+ delete ctx;
377
+ return;
179
378
  }
180
379
 
181
- responseText = response->content;
380
+ if (is_final) {
381
+ // Calculate stats
382
+ auto endTime = std::chrono::steady_clock::now();
383
+ double durationMs = std::chrono::duration<double, std::milli>(endTime - ctx->startTime).count();
384
+
385
+ if (ctx->lastStats && ctx->tokenCount > 0) {
386
+ ctx->lastStats->completionTokens = static_cast<double>(ctx->tokenCount);
387
+ ctx->lastStats->totalTime = durationMs / 1000.0;
388
+ ctx->lastStats->tokensPerSecond = (ctx->tokenCount / durationMs) * 1000.0;
389
+ }
390
+
391
+ // Update history (thread-safe)
392
+ {
393
+ std::lock_guard<std::mutex> lock(*ctx->historyMutex);
394
+ ctx->history->push_back(Message{Role::USER, ctx->userMessage});
395
+ ctx->history->push_back(Message{Role::MODEL, ctx->fullResponse});
396
+ }
397
+
398
+ ctx->onToken("", true);
399
+ delete ctx;
400
+ return;
401
+ }
182
402
 
183
- // Update stats from response if available
184
- if (response->stats.has_value()) {
185
- const auto& stats = response->stats.value();
186
- lastStats_.promptTokens = static_cast<double>(stats.prompt_tokens);
187
- lastStats_.completionTokens = static_cast<double>(stats.completion_tokens);
188
- lastStats_.totalTokens = lastStats_.promptTokens + lastStats_.completionTokens;
189
- lastStats_.timeToFirstToken = stats.time_to_first_token_ms;
190
- lastStats_.totalTime = stats.total_time_ms;
191
- lastStats_.tokensPerSecond = (lastStats_.totalTime > 0)
192
- ? lastStats_.completionTokens / (lastStats_.totalTime / 1000.0)
193
- : 0.0;
403
+ if (chunk) {
404
+ std::string token(chunk);
405
+ ctx->fullResponse += token;
406
+ ctx->tokenCount++;
407
+ ctx->onToken(token, false);
194
408
  }
409
+ }
410
+
411
+ void HybridLiteRTLM::sendMessageAsync(
412
+ const std::string& message,
413
+ const std::function<void(const std::string&, bool)>& onToken) {
414
+
415
+ // Copy values for the background thread (avoid use-after-free)
416
+ auto onTokenCopy = onToken;
417
+ auto messageCopy = message;
418
+
419
+ // Capture shared state safely
420
+ auto* ctx = new StreamContext();
421
+ ctx->onToken = std::move(onTokenCopy);
422
+ ctx->fullResponse = "";
423
+ ctx->history = &history_;
424
+ ctx->historyMutex = &mutex_;
425
+ ctx->userMessage = messageCopy;
426
+ ctx->lastStats = &lastStats_;
427
+ ctx->startTime = std::chrono::steady_clock::now();
428
+ ctx->tokenCount = 0;
429
+
430
+ #ifdef __APPLE__
431
+ ensureLoaded();
195
432
 
196
- #else
197
- // Stub response when LiteRT-LM is not available
198
- responseText = "[LiteRT-LM Stub] Model response placeholder. "
199
- "Real inference will be available when LiteRT-LM libraries are integrated. "
200
- "You said: " + message;
201
-
202
- auto endTime = std::chrono::high_resolution_clock::now();
203
- auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime).count();
204
-
205
- // Estimate stats for stub
206
- lastStats_.promptTokens = static_cast<double>(message.length() / 4);
207
- lastStats_.completionTokens = static_cast<double>(responseText.length() / 4);
208
- lastStats_.totalTokens = lastStats_.promptTokens + lastStats_.completionTokens;
209
- lastStats_.totalTime = static_cast<double>(duration);
210
- lastStats_.timeToFirstToken = lastStats_.totalTime / 2;
211
- lastStats_.tokensPerSecond = (lastStats_.totalTime > 0)
212
- ? lastStats_.completionTokens / (lastStats_.totalTime / 1000.0)
213
- : 0;
214
- #endif
433
+ std::string msgJson = buildTextMessageJson(messageCopy);
215
434
 
216
- // Add model response to history
217
- Message modelMessage;
218
- modelMessage.role = Role::MODEL;
219
- modelMessage.content = responseText;
220
- history_.push_back(modelMessage);
435
+ int result = litert_lm_conversation_send_message_stream(
436
+ conversation_, msgJson.c_str(), nullptr,
437
+ streamCallbackFn, ctx);
221
438
 
222
- return responseText;
439
+ if (result != 0) {
440
+ delete ctx;
441
+ throw std::runtime_error("LiteRT-LM: Failed to start streaming inference");
442
+ }
443
+ #else
444
+ // Non-Apple stub
445
+ ctx->onToken("[iOS only] Streaming not available on this platform.", true);
446
+ delete ctx;
447
+ #endif
223
448
  }
224
449
 
225
- //------------------------------------------------------------------------------
226
- // sendMessageWithImage - Multimodal image + text
227
- //------------------------------------------------------------------------------
228
- std::string HybridLiteRTLM::sendMessageWithImage(
450
+ // =============================================================================
451
+ // sendMessageWithImage Multimodal (vision)
452
+ // =============================================================================
453
+
454
+ std::shared_ptr<Promise<std::string>> HybridLiteRTLM::sendMessageWithImage(
455
+ const std::string& message,
456
+ const std::string& imagePath) {
457
+ return Promise<std::string>::async([this, message, imagePath]() -> std::string {
458
+ return sendMessageWithImageInternal(message, imagePath);
459
+ });
460
+ }
461
+
462
+ std::string HybridLiteRTLM::sendMessageWithImageInternal(
229
463
  const std::string& message,
230
464
  const std::string& imagePath) {
231
465
 
232
466
  std::lock_guard<std::mutex> lock(mutex_);
233
467
  ensureLoaded();
234
468
 
235
- #ifdef LITERT_LM_ENABLED
236
- // Load image using stb_image
237
- int width, height, channels;
238
- unsigned char* img = stbi_load(imagePath.c_str(), &width, &height, &channels, 3); // Force 3 channels (RGB)
239
- if (img == nullptr) {
240
- throw std::runtime_error("Failed to load image from path: " + imagePath);
241
- }
242
-
243
- // Create input tensor/buffer for the engine.
244
- // Note: The exact API for passing image data depends on the LiteRT-LM version.
245
- // Assuming a structure that accepts raw bytes and dimensions.
246
- litert::lm::UserMessage lm_message;
247
- lm_message.role = "user";
248
-
249
- // Construct multimodal content
250
- // Option A: If UserMessage supports a list of content parts
251
- litert::lm::ContentPart textPart;
252
- textPart.type = litert::lm::ContentType::TEXT;
253
- textPart.text = message;
254
- lm_message.parts.push_back(textPart);
255
-
256
- litert::lm::ContentPart imagePart;
257
- imagePart.type = litert::lm::ContentType::IMAGE;
258
- imagePart.image.width = width;
259
- imagePart.image.height = height;
260
- imagePart.image.channels = channels;
261
- imagePart.image.data = std::vector<uint8_t>(img, img + (width * height * channels));
262
- lm_message.parts.push_back(imagePart);
469
+ auto startTime = std::chrono::steady_clock::now();
470
+ std::string result;
263
471
 
264
- stbi_image_free(img);
265
-
266
- auto response = conversation_->SendMessage(lm_message);
267
- if (!response.ok()) {
268
- throw std::runtime_error("Multimodal inference failed: " +
269
- std::string(response.status().message()));
472
+ #ifdef __APPLE__
473
+ // Verify image exists
474
+ std::ifstream imageFile(imagePath);
475
+ if (!imageFile.good()) {
476
+ throw std::runtime_error("Image file not found: " + imagePath);
270
477
  }
478
+ imageFile.close();
271
479
 
272
- // Add to history (metadata only)
273
- Message userMessage;
274
- userMessage.role = Role::USER;
275
- userMessage.content = message + " [Image]";
276
- history_.push_back(userMessage);
480
+ // Build multimodal message JSON — the C API handles image preprocessing
481
+ std::string msgJson = buildImageMessageJson(message, imagePath);
277
482
 
278
- Message modelMessage;
279
- modelMessage.role = Role::MODEL;
280
- modelMessage.content = response->content;
281
- history_.push_back(modelMessage);
483
+ auto* response = litert_lm_conversation_send_message(
484
+ conversation_, msgJson.c_str(), nullptr);
282
485
 
283
- return response->content;
486
+ if (!response) {
487
+ throw std::runtime_error("LiteRT-LM: sendMessageWithImage failed");
488
+ }
284
489
 
490
+ const char* responseStr = litert_lm_json_response_get_string(response);
491
+ if (responseStr) {
492
+ result = extractTextFromResponse(std::string(responseStr));
493
+ }
494
+ litert_lm_json_response_delete(response);
285
495
  #else
286
- // iOS: LiteRT-LM SDK not yet available, throw clear error
287
- throw std::runtime_error(
288
- "sendMessageWithImage is not supported on iOS. "
289
- "LiteRT-LM iOS SDK is not yet available. "
290
- "Please use text-only sendMessage() for now.");
496
+ result = "[iOS only] Vision inference not available on this platform.";
291
497
  #endif
498
+
499
+ auto endTime = std::chrono::steady_clock::now();
500
+ lastStats_.totalTime = std::chrono::duration<double>(endTime - startTime).count();
501
+
502
+ history_.push_back(Message{Role::USER, message + " [image: " + imagePath + "]"});
503
+ history_.push_back(Message{Role::MODEL, result});
504
+
505
+ return result;
292
506
  }
293
507
 
294
- #endif
295
- }
508
+ // =============================================================================
509
+ // sendMessageWithAudio — Multimodal (audio)
510
+ // =============================================================================
296
511
 
297
- //------------------------------------------------------------------------------
298
- // downloadModel - Download model file from URL
299
- //------------------------------------------------------------------------------
300
- std::future<std::string> HybridLiteRTLM::downloadModel(
301
- const std::string& url,
302
- const std::string& fileName,
303
- const std::optional<std::function<void(double)>>& onProgress) {
304
-
305
- // Return a future that throws an exception
306
- return std::async(std::launch::async, []() -> std::string {
307
- throw std::runtime_error(
308
- "downloadModel is not supported on iOS yet. "
309
- "Please download the model manually using a separate library."
310
- );
512
+ std::shared_ptr<Promise<std::string>> HybridLiteRTLM::sendMessageWithAudio(
513
+ const std::string& message,
514
+ const std::string& audioPath) {
515
+ return Promise<std::string>::async([this, message, audioPath]() -> std::string {
516
+ return sendMessageWithAudioInternal(message, audioPath);
311
517
  });
312
518
  }
313
519
 
314
- //------------------------------------------------------------------------------
315
- // sendMessageWithAudio - Multimodal audio + text
316
- //------------------------------------------------------------------------------
317
- std::string HybridLiteRTLM::sendMessageWithAudio(
520
+ std::string HybridLiteRTLM::sendMessageWithAudioInternal(
318
521
  const std::string& message,
319
522
  const std::string& audioPath) {
320
523
 
321
524
  std::lock_guard<std::mutex> lock(mutex_);
322
525
  ensureLoaded();
323
526
 
324
- #ifdef LITERT_LM_ENABLED
325
- // Load audio file
326
- std::ifstream audioFile(audioPath, std::ios::binary);
327
- if (!audioFile) {
328
- throw std::runtime_error("Failed to open audio file: " + audioPath);
527
+ auto startTime = std::chrono::steady_clock::now();
528
+ std::string result;
529
+
530
+ #ifdef __APPLE__
531
+ std::ifstream audioFile(audioPath);
532
+ if (!audioFile.good()) {
533
+ throw std::runtime_error("Audio file not found: " + audioPath);
329
534
  }
535
+ audioFile.close();
330
536
 
331
- // Simple WAV header skip (simplistic, assuming standard header size for now or raw)
332
- // Ideally use a WAV parsing library or miniaudio if available.
333
- // For this implementation, we read the whole file.
334
- std::vector<uint8_t> audioData((std::istreambuf_iterator<char>(audioFile)), std::istreambuf_iterator<char>());
537
+ std::string msgJson = buildAudioMessageJson(message, audioPath);
335
538
 
336
- litert::lm::UserMessage lm_message;
337
- lm_message.role = "user";
539
+ auto* response = litert_lm_conversation_send_message(
540
+ conversation_, msgJson.c_str(), nullptr);
338
541
 
339
- litert::lm::ContentPart textPart;
340
- textPart.type = litert::lm::ContentType::TEXT;
341
- textPart.text = message;
342
- lm_message.parts.push_back(textPart);
343
-
344
- litert::lm::ContentPart audioPart;
345
- audioPart.type = litert::lm::ContentType::AUDIO;
346
- audioPart.audio.data = audioData;
347
- // Metadata like sample rate might be needed:
348
- // audioPart.audio.sample_rate = 16000;
349
- lm_message.parts.push_back(audioPart);
350
-
351
- auto response = conversation_->SendMessage(lm_message);
352
- if (!response.ok()) {
353
- throw std::runtime_error("Audio inference failed: " +
354
- std::string(response.status().message()));
542
+ if (!response) {
543
+ throw std::runtime_error("LiteRT-LM: sendMessageWithAudio failed");
355
544
  }
356
545
 
357
- Message userMessage;
358
- userMessage.role = Role::USER;
359
- userMessage.content = message + " [Audio]";
360
- history_.push_back(userMessage);
546
+ const char* responseStr = litert_lm_json_response_get_string(response);
547
+ if (responseStr) {
548
+ result = extractTextFromResponse(std::string(responseStr));
549
+ }
550
+ litert_lm_json_response_delete(response);
551
+ #else
552
+ result = "[iOS only] Audio inference not available on this platform.";
553
+ #endif
361
554
 
362
- Message modelMessage;
363
- modelMessage.role = Role::MODEL;
364
- modelMessage.content = response->content;
365
- history_.push_back(modelMessage);
555
+ auto endTime = std::chrono::steady_clock::now();
556
+ lastStats_.totalTime = std::chrono::duration<double>(endTime - startTime).count();
366
557
 
367
- return response->content;
558
+ history_.push_back(Message{Role::USER, message + " [audio: " + audioPath + "]"});
559
+ history_.push_back(Message{Role::MODEL, result});
368
560
 
561
+ return result;
562
+ }
563
+
564
+ // =============================================================================
565
+ // downloadModel — Download model from URL
566
+ // =============================================================================
567
+
568
+ std::shared_ptr<Promise<std::string>> HybridLiteRTLM::downloadModel(
569
+ const std::string& url,
570
+ const std::string& fileName,
571
+ const std::optional<std::function<void(double)>>& onProgress) {
572
+ return Promise<std::string>::async([url, fileName, onProgress]() -> std::string {
573
+ #ifdef __APPLE__
574
+ return litert_lm::downloadModelFile(url, fileName, onProgress);
369
575
  #else
370
- // iOS: LiteRT-LM SDK not yet available, throw clear error
371
- throw std::runtime_error(
372
- "sendMessageWithAudio is not supported on iOS. "
373
- "LiteRT-LM iOS SDK is not yet available. "
374
- "Please use text-only sendMessage() for now.");
576
+ std::string destPath = "/tmp/" + fileName;
577
+ std::string curlCmd = "curl -L -o \"" + destPath + "\" \"" + url + "\"";
578
+ int result = system(curlCmd.c_str());
579
+ if (result != 0) {
580
+ throw std::runtime_error("Failed to download model from: " + url);
581
+ }
582
+ if (onProgress.has_value()) {
583
+ onProgress.value()(1.0);
584
+ }
585
+ return destPath;
375
586
  #endif
587
+ });
376
588
  }
377
589
 
378
- //------------------------------------------------------------------------------
379
- // sendMessageAsync - Streaming token generation
380
- //------------------------------------------------------------------------------
381
- void HybridLiteRTLM::sendMessageAsync(
382
- const std::string& message,
383
- const std::function<void(std::string, bool)>& onToken) {
384
-
385
- // Note: We don't hold the lock during the entire async operation
386
- // to avoid blocking other operations. The callback may be invoked
387
- // from a different thread depending on LiteRT-LM's implementation.
388
-
389
- {
390
- std::lock_guard<std::mutex> lock(mutex_);
391
- ensureLoaded();
392
- }
393
-
394
- #ifdef LITERT_LM_ENABLED
395
- // Add user message to history before starting
396
- {
397
- std::lock_guard<std::mutex> lock(mutex_);
398
- Message userMessage;
399
- userMessage.role = Role::USER;
400
- userMessage.content = message;
401
- history_.push_back(userMessage);
402
- }
403
-
404
- litert::lm::UserMessage lm_message;
405
- lm_message.role = "user";
406
- lm_message.content = message;
407
-
408
- std::string fullResponse;
409
-
410
- // The callback needs to be carefully managed for thread safety
411
- auto status = conversation_->SendMessageAsync(
412
- lm_message,
413
- [this, &onToken, &fullResponse](const std::string& token, bool isDone) {
414
- fullResponse += token;
415
-
416
- // Invoke the JS callback (Nitro handles thread marshalling)
417
- onToken(token, isDone);
418
-
419
- if (isDone) {
420
- // Add complete response to history
421
- std::lock_guard<std::mutex> lock(mutex_);
422
- Message modelMessage;
423
- modelMessage.role = Role::MODEL;
424
- modelMessage.content = fullResponse;
425
- history_.push_back(modelMessage);
426
- }
427
- }
428
- );
429
-
430
- if (!status.ok()) {
431
- // Remove user message since inference failed
432
- std::lock_guard<std::mutex> lock(mutex_);
433
- if (!history_.empty()) {
434
- history_.pop_back();
590
+ std::shared_ptr<Promise<void>> HybridLiteRTLM::deleteModel(const std::string& fileName) {
591
+ return Promise<void>::async([fileName]() {
592
+ std::string path;
593
+ #ifdef __APPLE__
594
+ // Match the path used by IOSDownloadHelper: ~/Library/Caches/litert_models/
595
+ const char* home = getenv("HOME");
596
+ if (home) {
597
+ path = std::string(home) + "/Library/Caches/litert_models/" + fileName;
435
598
  }
436
- throw std::runtime_error("Async inference failed: " +
437
- std::string(status.message()));
438
- }
439
-
440
599
  #else
441
- // Stub: Simulate streaming by calling sendMessage and splitting response
442
- std::string fullResponse;
443
- {
444
- std::lock_guard<std::mutex> lock(mutex_);
445
-
446
- // Add user message
447
- Message userMessage;
448
- userMessage.role = Role::USER;
449
- userMessage.content = message;
450
- history_.push_back(userMessage);
451
-
452
- fullResponse = "[LiteRT-LM Stub] Streaming response placeholder. You said: " + message;
453
- }
454
-
455
- // Simulate token-by-token streaming
456
- std::string currentWord;
457
- for (size_t i = 0; i < fullResponse.length(); i++) {
458
- char c = fullResponse[i];
459
- currentWord += c;
460
-
461
- if (c == ' ' || c == '\n' || i == fullResponse.length() - 1) {
462
- bool isDone = (i == fullResponse.length() - 1);
463
- onToken(currentWord, isDone);
464
- currentWord.clear();
465
- }
466
- }
467
-
468
- // Add model response to history
469
- {
470
- std::lock_guard<std::mutex> lock(mutex_);
471
- Message modelMessage;
472
- modelMessage.role = Role::MODEL;
473
- modelMessage.content = fullResponse;
474
- history_.push_back(modelMessage);
475
- }
600
+ path = "/tmp/" + fileName;
476
601
  #endif
602
+ if (!path.empty()) {
603
+ std::remove(path.c_str());
604
+ }
605
+ });
477
606
  }
478
607
 
479
- //------------------------------------------------------------------------------
480
- // getHistory - Return conversation history
481
- //------------------------------------------------------------------------------
608
+ // =============================================================================
609
+ // getHistory
610
+ // =============================================================================
611
+
482
612
  std::vector<Message> HybridLiteRTLM::getHistory() {
483
613
  std::lock_guard<std::mutex> lock(mutex_);
484
614
  return history_;
485
615
  }
486
616
 
487
- //------------------------------------------------------------------------------
488
- // resetConversation - Clear KV cache, keep engine
489
- //------------------------------------------------------------------------------
617
+ // =============================================================================
618
+ // resetConversation
619
+ // =============================================================================
620
+
490
621
  void HybridLiteRTLM::resetConversation() {
491
622
  std::lock_guard<std::mutex> lock(mutex_);
492
623
 
493
- #ifdef LITERT_LM_ENABLED
494
- // Destroy old conversation and create a new one
495
- // This clears the KV cache but keeps the (expensive) Engine loaded
496
- if (engine_) {
497
- conversation_.reset();
624
+ history_.clear();
625
+ lastStats_ = GenerationStats{0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
626
+
627
+ #ifdef __APPLE__
628
+ if (isLoaded_ && engine_) {
498
629
  createNewConversation();
499
630
  }
500
631
  #endif
501
-
502
- history_.clear();
503
632
  }
504
633
 
505
- //------------------------------------------------------------------------------
506
- // isReady - Check if model is loaded
507
- //------------------------------------------------------------------------------
634
+ // =============================================================================
635
+ // isReady
636
+ // =============================================================================
637
+
508
638
  bool HybridLiteRTLM::isReady() {
509
639
  std::lock_guard<std::mutex> lock(mutex_);
510
640
  return isLoaded_;
511
641
  }
512
642
 
513
- //------------------------------------------------------------------------------
514
- // getStats - Return last generation statistics
515
- //------------------------------------------------------------------------------
643
+ // =============================================================================
644
+ // getStats
645
+ // =============================================================================
646
+
516
647
  GenerationStats HybridLiteRTLM::getStats() {
517
648
  std::lock_guard<std::mutex> lock(mutex_);
518
649
  return lastStats_;
519
650
  }
520
651
 
521
- //------------------------------------------------------------------------------
522
- // close - Release all native resources
523
- //------------------------------------------------------------------------------
524
- void HybridLiteRTLM::close() {
525
- std::lock_guard<std::mutex> lock(mutex_);
652
+ // =============================================================================
653
+ // getMemoryUsage — Uses Mach APIs for iOS process memory
654
+ // =============================================================================
655
+
656
+ MemoryUsage HybridLiteRTLM::getMemoryUsage() {
657
+ double usedMemoryBytes = 0;
658
+ double totalMemoryBytes = 0;
659
+ double availableBytes = 0;
660
+ bool isLowMemory = false;
661
+
662
+ #ifdef __APPLE__
663
+ // Get app process memory (resident set size)
664
+ struct mach_task_basic_info info;
665
+ mach_msg_type_number_t count = MACH_TASK_BASIC_INFO_COUNT;
666
+
667
+ kern_return_t kr = task_info(mach_task_self(),
668
+ MACH_TASK_BASIC_INFO,
669
+ (task_info_t)&info,
670
+ &count);
671
+
672
+ if (kr == KERN_SUCCESS) {
673
+ usedMemoryBytes = static_cast<double>(info.resident_size);
674
+ }
675
+
676
+ // Get total physical memory
677
+ mach_port_t host_port = mach_host_self();
678
+ struct host_basic_info hostInfo;
679
+ mach_msg_type_number_t hostCount = HOST_BASIC_INFO_COUNT;
680
+
681
+ kr = host_info(host_port, HOST_BASIC_INFO,
682
+ (host_info_t)&hostInfo, &hostCount);
526
683
 
527
- #ifdef LITERT_LM_ENABLED
528
- // Release in reverse order of creation
529
- conversation_.reset();
530
- engine_.reset();
684
+ if (kr == KERN_SUCCESS) {
685
+ totalMemoryBytes = static_cast<double>(hostInfo.max_mem);
686
+ }
687
+
688
+ availableBytes = totalMemoryBytes - usedMemoryBytes;
689
+ if (availableBytes < 0) availableBytes = 0;
690
+
691
+ // Low memory threshold (~200MB available)
692
+ isLowMemory = (totalMemoryBytes > 0) && (availableBytes < 200.0 * 1024.0 * 1024.0);
531
693
  #endif
532
694
 
695
+ return MemoryUsage{
696
+ usedMemoryBytes, // nativeHeapBytes
697
+ usedMemoryBytes, // residentBytes
698
+ availableBytes, // availableMemoryBytes
699
+ isLowMemory // isLowMemory
700
+ };
701
+ }
702
+
703
+ // =============================================================================
704
+ // close — Clean up all LiteRT-LM resources
705
+ // =============================================================================
706
+
707
+ void HybridLiteRTLM::close() {
708
+ // Note: Don't lock here if called from destructor (mutex may be destroyed)
709
+ // The caller (loadModel, destructor) should handle locking.
710
+
533
711
  isLoaded_ = false;
534
712
  history_.clear();
713
+
714
+ #ifdef __APPLE__
715
+ if (conversation_) {
716
+ litert_lm_conversation_delete(conversation_);
717
+ conversation_ = nullptr;
718
+ }
719
+ if (conv_config_) {
720
+ litert_lm_conversation_config_delete(conv_config_);
721
+ conv_config_ = nullptr;
722
+ }
723
+ if (session_config_) {
724
+ litert_lm_session_config_delete(session_config_);
725
+ session_config_ = nullptr;
726
+ }
727
+ if (engine_) {
728
+ litert_lm_engine_delete(engine_);
729
+ engine_ = nullptr;
730
+ }
731
+ #endif
732
+
733
+ lastStats_ = GenerationStats{0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
535
734
  }
536
735
 
537
736
  } // namespace margelo::nitro::litertlm