react-native-litert-lm 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/README.md +270 -186
  2. package/android/build.gradle +1 -1
  3. package/android/src/main/java/com/margelo/nitro/dev/litert/litertlm/HybridLiteRTLM.kt +93 -37
  4. package/app.plugin.js +33 -0
  5. package/cpp/HybridLiteRTLM.cpp +571 -451
  6. package/cpp/HybridLiteRTLM.hpp +54 -23
  7. package/cpp/IOSDownloadHelper.h +24 -0
  8. package/cpp/cpp-adapter.cpp +2 -2
  9. package/cpp/include/litert_lm_engine.h +502 -0
  10. package/ios/IOSDownloadHelper.mm +129 -0
  11. package/ios/LiteRTLMAutolinking.mm +30 -0
  12. package/lib/hooks.d.ts +9 -4
  13. package/lib/hooks.js +34 -20
  14. package/lib/index.d.ts +1 -0
  15. package/lib/index.js +2 -5
  16. package/lib/memoryTracker.d.ts +1 -1
  17. package/lib/memoryTracker.js +1 -1
  18. package/lib/modelFactory.d.ts +11 -5
  19. package/lib/modelFactory.js +9 -4
  20. package/nitrogen/generated/android/LiteRTLMOnLoad.cpp +11 -4
  21. package/nitrogen/generated/android/c++/JHybridLiteRTLMSpec.cpp +31 -37
  22. package/nitrogen/generated/android/c++/JHybridLiteRTLMSpec.hpp +19 -22
  23. package/nitrogen/generated/android/kotlin/com/margelo/nitro/dev/litert/litertlm/HybridLiteRTLMSpec.kt +15 -18
  24. package/package.json +12 -5
  25. package/react-native-litert-lm.podspec +20 -7
  26. package/scripts/build-ios-engine.sh +283 -0
  27. package/scripts/download-ios-frameworks.sh +72 -0
  28. package/scripts/postinstall.js +116 -0
  29. package/scripts/stubs/cxx_bridge_stubs.cc +224 -0
  30. package/scripts/stubs/gemma_model_constraint_provider.cc +46 -0
  31. package/scripts/stubs/llguidance_stubs.c +101 -0
  32. package/src/hooks.ts +62 -39
  33. package/src/index.ts +4 -7
  34. package/src/memoryTracker.ts +1 -1
  35. package/src/modelFactory.ts +30 -5
@@ -2,7 +2,7 @@
2
2
  // HybridLiteRTLM.cpp
3
3
  // react-native-litert-lm
4
4
  //
5
- // High-performance LLM inference using LiteRT-LM.
5
+ // High-performance LLM inference using LiteRT-LM C API.
6
6
  //
7
7
  // NOTE: This C++ implementation is used for iOS ONLY.
8
8
  // Android uses the Kotlin implementation in `android/src/main/java/com/margelo/nitro/dev/litert/litertlm/HybridLiteRTLM.kt`.
@@ -11,80 +11,202 @@
11
11
 
12
12
  #include "HybridLiteRTLM.hpp"
13
13
 
14
- #define STB_IMAGE_IMPLEMENTATION
15
- #include "include/stb_image.h"
16
14
 
15
+
16
+
17
+ #include <NitroModules/Promise.hpp>
17
18
  #include <chrono>
18
19
  #include <stdexcept>
19
20
  #include <sstream>
21
+
22
+ #ifdef __APPLE__
23
+ #include "IOSDownloadHelper.h"
24
+ #endif
20
25
  #include <fstream>
26
+ #include <thread>
27
+ #include <regex>
21
28
 
22
29
  namespace margelo::nitro::litertlm {
23
30
 
24
- //------------------------------------------------------------------------------
25
- // Helper: Format user prompt (applies chat template if needed)
26
- //------------------------------------------------------------------------------
27
- std::string HybridLiteRTLM::formatUserPrompt(const std::string& message) const {
28
- // The LiteRT-LM Conversation class handles chat templates internally,
29
- // so we just return the message as-is. If we were using Session directly,
30
- // we'd apply the Gemma/Phi template here.
31
- return message;
31
+ // =============================================================================
32
+ // JSON Helpers
33
+ // =============================================================================
34
+
35
+ std::string HybridLiteRTLM::escapeJson(const std::string& input) {
36
+ std::string output;
37
+ output.reserve(input.size() + 16);
38
+ for (char c : input) {
39
+ switch (c) {
40
+ case '"': output += "\\\""; break;
41
+ case '\\': output += "\\\\"; break;
42
+ case '\n': output += "\\n"; break;
43
+ case '\r': output += "\\r"; break;
44
+ case '\t': output += "\\t"; break;
45
+ case '\b': output += "\\b"; break;
46
+ case '\f': output += "\\f"; break;
47
+ default: output += c; break;
48
+ }
49
+ }
50
+ return output;
51
+ }
52
+
53
+ std::string HybridLiteRTLM::buildTextMessageJson(const std::string& text) {
54
+ return "{\"role\":\"user\",\"content\":\"" + escapeJson(text) + "\"}";
55
+ }
56
+
57
+ std::string HybridLiteRTLM::buildImageMessageJson(const std::string& text, const std::string& imagePath) {
58
+ return "{\"role\":\"user\",\"content\":["
59
+ "{\"type\":\"text\",\"text\":\"" + escapeJson(text) + "\"},"
60
+ "{\"type\":\"image\",\"path\":\"" + escapeJson(imagePath) + "\"}"
61
+ "]}";
32
62
  }
33
63
 
34
- //------------------------------------------------------------------------------
35
- // Helper: Create a new Conversation from existing Engine
36
- //------------------------------------------------------------------------------
64
+ std::string HybridLiteRTLM::buildAudioMessageJson(const std::string& text, const std::string& audioPath) {
65
+ return "{\"role\":\"user\",\"content\":["
66
+ "{\"type\":\"text\",\"text\":\"" + escapeJson(text) + "\"},"
67
+ "{\"type\":\"audio\",\"path\":\"" + escapeJson(audioPath) + "\"}"
68
+ "]}";
69
+ }
70
+
71
+ std::string HybridLiteRTLM::extractTextFromResponse(const std::string& jsonResponse) {
72
+ // The C API response JSON is structured as:
73
+ // {"role":"model","content":[{"type":"text","text":"..."}]}
74
+ // or:
75
+ // {"role":"model","content":"..."}
76
+ //
77
+ // We use simple string extraction to avoid a JSON library dependency.
78
+
79
+ // Try array format first: find "text":"..." after "type":"text"
80
+ std::string textMarker = "\"text\":\"";
81
+ size_t pos = jsonResponse.find("\"type\":\"text\"");
82
+ if (pos != std::string::npos) {
83
+ pos = jsonResponse.find(textMarker, pos);
84
+ if (pos != std::string::npos) {
85
+ pos += textMarker.length();
86
+ std::string result;
87
+ result.reserve(jsonResponse.size() - pos);
88
+ for (size_t i = pos; i < jsonResponse.size(); i++) {
89
+ if (jsonResponse[i] == '\\' && i + 1 < jsonResponse.size()) {
90
+ char next = jsonResponse[i + 1];
91
+ if (next == '"') { result += '"'; i++; }
92
+ else if (next == '\\') { result += '\\'; i++; }
93
+ else if (next == 'n') { result += '\n'; i++; }
94
+ else if (next == 'r') { result += '\r'; i++; }
95
+ else if (next == 't') { result += '\t'; i++; }
96
+ else { result += jsonResponse[i]; }
97
+ } else if (jsonResponse[i] == '"') {
98
+ break; // End of the text value
99
+ } else {
100
+ result += jsonResponse[i];
101
+ }
102
+ }
103
+ return result;
104
+ }
105
+ }
106
+
107
+ // Try simple string format: "content":"..."
108
+ std::string contentMarker = "\"content\":\"";
109
+ pos = jsonResponse.find(contentMarker);
110
+ if (pos != std::string::npos) {
111
+ pos += contentMarker.length();
112
+ std::string result;
113
+ for (size_t i = pos; i < jsonResponse.size(); i++) {
114
+ if (jsonResponse[i] == '\\' && i + 1 < jsonResponse.size()) {
115
+ char next = jsonResponse[i + 1];
116
+ if (next == '"') { result += '"'; i++; }
117
+ else if (next == '\\') { result += '\\'; i++; }
118
+ else if (next == 'n') { result += '\n'; i++; }
119
+ else { result += jsonResponse[i]; }
120
+ } else if (jsonResponse[i] == '"') {
121
+ break;
122
+ } else {
123
+ result += jsonResponse[i];
124
+ }
125
+ }
126
+ return result;
127
+ }
128
+
129
+ // Fallback: return full response
130
+ return jsonResponse;
131
+ }
132
+
133
+ // =============================================================================
134
+ // Conversation Management
135
+ // =============================================================================
136
+
37
137
  void HybridLiteRTLM::createNewConversation() {
38
- #ifdef LITERT_LM_ENABLED
138
+ #ifdef __APPLE__
39
139
  if (!engine_) {
40
140
  throw std::runtime_error("Cannot create conversation: engine not initialized");
41
141
  }
42
142
 
43
- auto conversation_config = litert::lm::ConversationConfig::CreateDefault(*engine_);
44
- if (!conversation_config.ok()) {
45
- throw std::runtime_error("Failed to create conversation config: " +
46
- std::string(conversation_config.status().message()));
143
+ // Clean up previous conversation
144
+ if (conversation_) {
145
+ litert_lm_conversation_delete(conversation_);
146
+ conversation_ = nullptr;
147
+ }
148
+ if (conv_config_) {
149
+ litert_lm_conversation_config_delete(conv_config_);
150
+ conv_config_ = nullptr;
47
151
  }
48
152
 
49
- auto conversation = litert::lm::Conversation::Create(*engine_, *conversation_config);
50
- if (!conversation.ok()) {
51
- throw std::runtime_error("Failed to create conversation: " +
52
- std::string(conversation.status().message()));
153
+ // Build system message JSON if provided
154
+ std::string systemMsgJson;
155
+ const char* systemMsgPtr = nullptr;
156
+ if (!systemPrompt_.empty()) {
157
+ systemMsgJson = "{\"role\":\"system\",\"content\":\"" + escapeJson(systemPrompt_) + "\"}";
158
+ systemMsgPtr = systemMsgJson.c_str();
159
+ }
160
+
161
+ // Create conversation config with session config
162
+ conv_config_ = litert_lm_conversation_config_create(
163
+ engine_,
164
+ session_config_, // may be nullptr for defaults
165
+ systemMsgPtr, // system message
166
+ nullptr, // tools (not used yet)
167
+ nullptr, // messages history
168
+ false // constrained decoding
169
+ );
170
+ if (!conv_config_) {
171
+ throw std::runtime_error("Failed to create conversation config");
172
+ }
173
+
174
+ // Create conversation
175
+ conversation_ = litert_lm_conversation_create(engine_, conv_config_);
176
+ if (!conversation_) {
177
+ litert_lm_conversation_config_delete(conv_config_);
178
+ conv_config_ = nullptr;
179
+ throw std::runtime_error("Failed to create conversation");
53
180
  }
54
- conversation_ = std::move(*conversation);
55
181
  #endif
56
182
  }
57
183
 
58
- //------------------------------------------------------------------------------
59
- // loadModel - Initialize Engine and Conversation
60
- //------------------------------------------------------------------------------
61
- void HybridLiteRTLM::loadModel(
184
+ // =============================================================================
185
+ // loadModel
186
+ // =============================================================================
187
+
188
+ std::shared_ptr<Promise<void>> HybridLiteRTLM::loadModel(
189
+ const std::string& modelPath,
190
+ const std::optional<LLMConfig>& config) {
191
+ return Promise<void>::async([this, modelPath, config]() {
192
+ loadModelInternal(modelPath, config);
193
+ });
194
+ }
195
+
196
+ void HybridLiteRTLM::loadModelInternal(
62
197
  const std::string& modelPath,
63
198
  const std::optional<LLMConfig>& config) {
64
199
 
65
200
  std::lock_guard<std::mutex> lock(mutex_);
66
201
 
67
- // Clean up existing resources
68
202
  if (isLoaded_) {
69
- isLoaded_ = false;
70
- history_.clear();
71
- #ifdef LITERT_LM_ENABLED
72
- conversation_.reset();
73
- engine_.reset();
74
- #endif
203
+ close();
75
204
  }
76
205
 
77
- // Apply configuration
78
206
  if (config.has_value()) {
79
207
  if (config->backend.has_value()) {
80
208
  backend_ = config->backend.value();
81
209
  }
82
- if (config->visionBackend.has_value()) {
83
- visionBackend_ = config->visionBackend.value();
84
- }
85
- if (config->audioBackend.has_value()) {
86
- audioBackend_ = config->audioBackend.value();
87
- }
88
210
  if (config->temperature.has_value()) {
89
211
  temperature_ = config->temperature.value();
90
212
  }
@@ -97,520 +219,518 @@ void HybridLiteRTLM::loadModel(
97
219
  if (config->maxTokens.has_value()) {
98
220
  maxTokens_ = config->maxTokens.value();
99
221
  }
222
+ if (config->systemPrompt.has_value()) {
223
+ systemPrompt_ = config->systemPrompt.value();
224
+ }
100
225
  }
101
226
 
102
- #ifdef LITERT_LM_ENABLED
103
- // 1. Create ModelAssets from model path
104
- auto model_assets = litert::lm::ModelAssets::Create(modelPath);
105
- if (!model_assets.ok()) {
106
- throw std::runtime_error("Failed to load model assets: " +
107
- std::string(model_assets.status().message()));
108
- }
109
-
110
- // 2. Map our Backend enum to LiteRT-LM Backend enum
111
- auto engine_backend = (backend_ == Backend::GPU)
112
- ? litert::lm::Backend::GPU
113
- : litert::lm::Backend::CPU;
114
- auto vision_backend = (visionBackend_ == Backend::GPU)
115
- ? litert::lm::Backend::GPU
116
- : litert::lm::Backend::CPU;
117
- auto audio_backend = (audioBackend_ == Backend::GPU)
118
- ? litert::lm::Backend::GPU
119
- : litert::lm::Backend::CPU;
120
-
121
- // 3. Create EngineSettings with all backends
122
- auto engine_settings = litert::lm::EngineSettings::CreateDefault(
123
- *model_assets,
124
- engine_backend,
125
- vision_backend,
126
- audio_backend
127
- );
128
-
129
- // 4. Create the Engine (heavyweight - loads model weights)
130
- auto engine = litert::lm::Engine::CreateEngine(engine_settings);
131
- if (!engine.ok()) {
132
- throw std::runtime_error("Failed to create engine: " +
133
- std::string(engine.status().message()));
227
+ #ifdef __APPLE__
228
+ // Set log verbosity: 2=WARNING (production), 0=INFO (debug)
229
+ litert_lm_set_min_log_level(2);
230
+
231
+ auto backendStr = [](Backend b) -> const char* {
232
+ switch (b) {
233
+ case Backend::GPU: return "gpu";
234
+ case Backend::NPU: return "gpu"; // NPU not available on iOS, use GPU
235
+ default: return "cpu";
236
+ }
237
+ };
238
+
239
+ auto tryCreateEngine = [&](const char* backend, const char* visionBackend) -> bool {
240
+ auto* settings = litert_lm_engine_settings_create(
241
+ modelPath.c_str(),
242
+ backend,
243
+ visionBackend,
244
+ "cpu" // audio always on CPU
245
+ );
246
+ if (!settings) {
247
+ return false;
248
+ }
249
+
250
+ litert_lm_engine_settings_set_max_num_tokens(settings, static_cast<int>(maxTokens_));
251
+ litert_lm_engine_settings_enable_benchmark(settings);
252
+
253
+ engine_ = litert_lm_engine_create(settings);
254
+ litert_lm_engine_settings_delete(settings);
255
+
256
+ return engine_ != nullptr;
257
+ };
258
+
259
+ // Try requested backend first (e.g. gpu/gpu)
260
+ const char* primaryBackend = backendStr(backend_);
261
+ if (!tryCreateEngine(primaryBackend, primaryBackend)) {
262
+ // Fallback chain for when the primary backend fails:
263
+ bool fallbackOk = false;
264
+ if (backend_ != Backend::CPU) {
265
+ // 1) Try CPU main + GPU vision (model's vision encoder often requires GPU)
266
+ fallbackOk = tryCreateEngine("cpu", "gpu");
267
+ // 2) Try CPU main + CPU vision
268
+ if (!fallbackOk) fallbackOk = tryCreateEngine("cpu", "cpu");
269
+ }
270
+ // 3) Try CPU main + no vision (nullptr skips vision executor entirely)
271
+ if (!fallbackOk) fallbackOk = tryCreateEngine("cpu", nullptr);
272
+ if (fallbackOk) {
273
+ backend_ = Backend::CPU;
274
+ }
275
+ }
276
+
277
+ if (!engine_) {
278
+ throw std::runtime_error(
279
+ "Failed to create LiteRT-LM engine. Tried backend '" +
280
+ std::string(primaryBackend) + "' and CPU fallback. Model path: " + modelPath);
281
+ }
282
+
283
+ session_config_ = litert_lm_session_config_create();
284
+ if (session_config_) {
285
+ litert_lm_session_config_set_max_output_tokens(session_config_, static_cast<int>(maxTokens_));
286
+
287
+ LiteRtLmSamplerParams sampler{};
288
+ sampler.type = kTopP;
289
+ sampler.top_k = static_cast<int32_t>(topK_);
290
+ sampler.top_p = static_cast<float>(topP_);
291
+ sampler.temperature = static_cast<float>(temperature_);
292
+ sampler.seed = 0;
293
+ litert_lm_session_config_set_sampler_params(session_config_, &sampler);
134
294
  }
135
- engine_ = std::move(*engine);
136
-
137
- // 5. Create the Conversation (lightweight - holds KV cache)
138
- createNewConversation();
139
295
 
140
- #endif // LITERT_LM_ENABLED
296
+ createNewConversation();
297
+ #endif
141
298
 
142
299
  isLoaded_ = true;
143
300
  history_.clear();
144
-
145
- // Reset stats
146
301
  lastStats_ = GenerationStats{0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
147
302
  }
148
303
 
149
- //------------------------------------------------------------------------------
150
- // sendMessage - Blocking text inference
151
- //------------------------------------------------------------------------------
152
- std::string HybridLiteRTLM::sendMessage(const std::string& message) {
304
+ // =============================================================================
305
+ // sendMessage — Blocking text inference
306
+ // =============================================================================
307
+
308
+ std::shared_ptr<Promise<std::string>> HybridLiteRTLM::sendMessage(const std::string& message) {
309
+ return Promise<std::string>::async([this, message]() -> std::string {
310
+ return sendMessageInternal(message);
311
+ });
312
+ }
313
+
314
+ std::string HybridLiteRTLM::sendMessageInternal(const std::string& message) {
153
315
  std::lock_guard<std::mutex> lock(mutex_);
154
316
  ensureLoaded();
155
317
 
156
- auto startTime = std::chrono::high_resolution_clock::now();
318
+ auto startTime = std::chrono::steady_clock::now();
319
+ std::string result;
157
320
 
158
- // Add user message to history
159
- Message userMessage;
160
- userMessage.role = Role::USER;
161
- userMessage.content = message;
162
- history_.push_back(userMessage);
163
-
164
- std::string responseText;
321
+ #ifdef __APPLE__
322
+ std::string msgJson = buildTextMessageJson(message);
165
323
 
166
- #ifdef LITERT_LM_ENABLED
167
- // Build the message struct for LiteRT-LM
168
- // The Conversation API expects a structured input
169
- litert::lm::UserMessage lm_message;
170
- lm_message.role = "user";
171
- lm_message.content = message;
324
+ auto* response = litert_lm_conversation_send_message(
325
+ conversation_, msgJson.c_str(), nullptr);
172
326
 
173
- auto response = conversation_->SendMessage(lm_message);
174
- if (!response.ok()) {
175
- // Remove the user message we just added since inference failed
176
- history_.pop_back();
177
- throw std::runtime_error("Inference failed: " +
178
- std::string(response.status().message()));
327
+ if (!response) {
328
+ throw std::runtime_error("LiteRT-LM: sendMessage failed");
179
329
  }
180
330
 
181
- responseText = response->content;
182
-
183
- // Update stats from response if available
184
- if (response->stats.has_value()) {
185
- const auto& stats = response->stats.value();
186
- lastStats_.promptTokens = static_cast<double>(stats.prompt_tokens);
187
- lastStats_.completionTokens = static_cast<double>(stats.completion_tokens);
188
- lastStats_.totalTokens = lastStats_.promptTokens + lastStats_.completionTokens;
189
- lastStats_.timeToFirstToken = stats.time_to_first_token_ms;
190
- lastStats_.totalTime = stats.total_time_ms;
191
- lastStats_.tokensPerSecond = (lastStats_.totalTime > 0)
192
- ? lastStats_.completionTokens / (lastStats_.totalTime / 1000.0)
193
- : 0.0;
331
+ const char* responseStr = litert_lm_json_response_get_string(response);
332
+ if (responseStr) {
333
+ result = extractTextFromResponse(std::string(responseStr));
334
+ }
335
+ litert_lm_json_response_delete(response);
336
+
337
+ auto* benchInfo = litert_lm_conversation_get_benchmark_info(conversation_);
338
+ if (benchInfo) {
339
+ int numDecodeTurns = litert_lm_benchmark_info_get_num_decode_turns(benchInfo);
340
+ if (numDecodeTurns > 0) {
341
+ int lastIdx = numDecodeTurns - 1;
342
+ lastStats_.tokensPerSecond = litert_lm_benchmark_info_get_decode_tokens_per_sec_at(benchInfo, lastIdx);
343
+ lastStats_.completionTokens = static_cast<double>(
344
+ litert_lm_benchmark_info_get_decode_token_count_at(benchInfo, lastIdx));
345
+ }
346
+ lastStats_.timeToFirstToken = litert_lm_benchmark_info_get_time_to_first_token(benchInfo);
347
+ litert_lm_benchmark_info_delete(benchInfo);
194
348
  }
195
-
196
349
  #else
197
- // Stub response when LiteRT-LM is not available
198
- responseText = "[LiteRT-LM Stub] Model response placeholder. "
199
- "Real inference will be available when LiteRT-LM libraries are integrated. "
200
- "You said: " + message;
201
-
202
- auto endTime = std::chrono::high_resolution_clock::now();
203
- auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime).count();
204
-
205
- // Estimate stats for stub
206
- lastStats_.promptTokens = static_cast<double>(message.length() / 4);
207
- lastStats_.completionTokens = static_cast<double>(responseText.length() / 4);
208
- lastStats_.totalTokens = lastStats_.promptTokens + lastStats_.completionTokens;
209
- lastStats_.totalTime = static_cast<double>(duration);
210
- lastStats_.timeToFirstToken = lastStats_.totalTime / 2;
211
- lastStats_.tokensPerSecond = (lastStats_.totalTime > 0)
212
- ? lastStats_.completionTokens / (lastStats_.totalTime / 1000.0)
213
- : 0;
350
+ // Non-Apple stub
351
+ result = "[iOS only] LiteRT-LM inference not available on this platform.";
214
352
  #endif
215
353
 
216
- // Add model response to history
217
- Message modelMessage;
218
- modelMessage.role = Role::MODEL;
219
- modelMessage.content = responseText;
220
- history_.push_back(modelMessage);
354
+ auto endTime = std::chrono::steady_clock::now();
355
+ double latencyMs = std::chrono::duration<double, std::milli>(endTime - startTime).count();
356
+ lastStats_.totalTime = latencyMs / 1000.0;
357
+
358
+ // Update history
359
+ history_.push_back(Message{Role::USER, message});
360
+ history_.push_back(Message{Role::MODEL, result});
221
361
 
222
- return responseText;
362
+ return result;
223
363
  }
224
364
 
225
- //------------------------------------------------------------------------------
226
- // sendMessageWithImage - Multimodal image + text
227
- //------------------------------------------------------------------------------
228
- std::string HybridLiteRTLM::sendMessageWithImage(
365
+ // =============================================================================
366
+ // sendMessageAsync — Streaming text inference
367
+ // =============================================================================
368
+
369
+ void HybridLiteRTLM::streamCallbackFn(void* callback_data, const char* chunk,
370
+ bool is_final, const char* error_msg) {
371
+ auto* ctx = static_cast<StreamContext*>(callback_data);
372
+
373
+ if (error_msg) {
374
+ // Error occurred — notify JS and clean up
375
+ ctx->onToken(std::string("Error: ") + error_msg, true);
376
+ delete ctx;
377
+ return;
378
+ }
379
+
380
+ if (is_final) {
381
+ // Calculate stats
382
+ auto endTime = std::chrono::steady_clock::now();
383
+ double durationMs = std::chrono::duration<double, std::milli>(endTime - ctx->startTime).count();
384
+
385
+ if (ctx->lastStats && ctx->tokenCount > 0) {
386
+ ctx->lastStats->completionTokens = static_cast<double>(ctx->tokenCount);
387
+ ctx->lastStats->totalTime = durationMs / 1000.0;
388
+ ctx->lastStats->tokensPerSecond = (ctx->tokenCount / durationMs) * 1000.0;
389
+ }
390
+
391
+ // Update history (thread-safe)
392
+ {
393
+ std::lock_guard<std::mutex> lock(*ctx->historyMutex);
394
+ ctx->history->push_back(Message{Role::USER, ctx->userMessage});
395
+ ctx->history->push_back(Message{Role::MODEL, ctx->fullResponse});
396
+ }
397
+
398
+ ctx->onToken("", true);
399
+ delete ctx;
400
+ return;
401
+ }
402
+
403
+ if (chunk) {
404
+ std::string token(chunk);
405
+ ctx->fullResponse += token;
406
+ ctx->tokenCount++;
407
+ ctx->onToken(token, false);
408
+ }
409
+ }
410
+
411
+ void HybridLiteRTLM::sendMessageAsync(
229
412
  const std::string& message,
230
- const std::string& imagePath) {
413
+ const std::function<void(const std::string&, bool)>& onToken) {
414
+
415
+ // Copy values for the background thread (avoid use-after-free)
416
+ auto onTokenCopy = onToken;
417
+ auto messageCopy = message;
418
+
419
+ // Capture shared state safely
420
+ auto* ctx = new StreamContext();
421
+ ctx->onToken = std::move(onTokenCopy);
422
+ ctx->fullResponse = "";
423
+ ctx->history = &history_;
424
+ ctx->historyMutex = &mutex_;
425
+ ctx->userMessage = messageCopy;
426
+ ctx->lastStats = &lastStats_;
427
+ ctx->startTime = std::chrono::steady_clock::now();
428
+ ctx->tokenCount = 0;
231
429
 
232
- std::lock_guard<std::mutex> lock(mutex_);
430
+ #ifdef __APPLE__
233
431
  ensureLoaded();
234
432
 
235
- #ifdef LITERT_LM_ENABLED
236
- // Load image using stb_image
237
- int width, height, channels;
238
- unsigned char* img = stbi_load(imagePath.c_str(), &width, &height, &channels, 3); // Force 3 channels (RGB)
239
- if (img == nullptr) {
240
- throw std::runtime_error("Failed to load image from path: " + imagePath);
433
+ std::string msgJson = buildTextMessageJson(messageCopy);
434
+
435
+ int result = litert_lm_conversation_send_message_stream(
436
+ conversation_, msgJson.c_str(), nullptr,
437
+ streamCallbackFn, ctx);
438
+
439
+ if (result != 0) {
440
+ delete ctx;
441
+ throw std::runtime_error("LiteRT-LM: Failed to start streaming inference");
241
442
  }
443
+ #else
444
+ // Non-Apple stub
445
+ ctx->onToken("[iOS only] Streaming not available on this platform.", true);
446
+ delete ctx;
447
+ #endif
448
+ }
242
449
 
243
- // Create input tensor/buffer for the engine.
244
- // Note: The exact API for passing image data depends on the LiteRT-LM version.
245
- // Assuming a structure that accepts raw bytes and dimensions.
246
- litert::lm::UserMessage lm_message;
247
- lm_message.role = "user";
248
-
249
- // Construct multimodal content
250
- // Option A: If UserMessage supports a list of content parts
251
- litert::lm::ContentPart textPart;
252
- textPart.type = litert::lm::ContentType::TEXT;
253
- textPart.text = message;
254
- lm_message.parts.push_back(textPart);
450
+ // =============================================================================
451
+ // sendMessageWithImage — Multimodal (vision)
452
+ // =============================================================================
255
453
 
256
- litert::lm::ContentPart imagePart;
257
- imagePart.type = litert::lm::ContentType::IMAGE;
258
- imagePart.image.width = width;
259
- imagePart.image.height = height;
260
- imagePart.image.channels = channels;
261
- imagePart.image.data = std::vector<uint8_t>(img, img + (width * height * channels));
262
- lm_message.parts.push_back(imagePart);
263
-
264
- stbi_image_free(img);
454
+ std::shared_ptr<Promise<std::string>> HybridLiteRTLM::sendMessageWithImage(
455
+ const std::string& message,
456
+ const std::string& imagePath) {
457
+ return Promise<std::string>::async([this, message, imagePath]() -> std::string {
458
+ return sendMessageWithImageInternal(message, imagePath);
459
+ });
460
+ }
265
461
 
266
- auto response = conversation_->SendMessage(lm_message);
267
- if (!response.ok()) {
268
- throw std::runtime_error("Multimodal inference failed: " +
269
- std::string(response.status().message()));
462
+ std::string HybridLiteRTLM::sendMessageWithImageInternal(
463
+ const std::string& message,
464
+ const std::string& imagePath) {
465
+
466
+ std::lock_guard<std::mutex> lock(mutex_);
467
+ ensureLoaded();
468
+
469
+ auto startTime = std::chrono::steady_clock::now();
470
+ std::string result;
471
+
472
+ #ifdef __APPLE__
473
+ // Verify image exists
474
+ std::ifstream imageFile(imagePath);
475
+ if (!imageFile.good()) {
476
+ throw std::runtime_error("Image file not found: " + imagePath);
270
477
  }
478
+ imageFile.close();
271
479
 
272
- // Add to history (metadata only)
273
- Message userMessage;
274
- userMessage.role = Role::USER;
275
- userMessage.content = message + " [Image]";
276
- history_.push_back(userMessage);
480
+ // Build multimodal message JSON — the C API handles image preprocessing
481
+ std::string msgJson = buildImageMessageJson(message, imagePath);
277
482
 
278
- Message modelMessage;
279
- modelMessage.role = Role::MODEL;
280
- modelMessage.content = response->content;
281
- history_.push_back(modelMessage);
483
+ auto* response = litert_lm_conversation_send_message(
484
+ conversation_, msgJson.c_str(), nullptr);
282
485
 
283
- return response->content;
486
+ if (!response) {
487
+ throw std::runtime_error("LiteRT-LM: sendMessageWithImage failed");
488
+ }
284
489
 
490
+ const char* responseStr = litert_lm_json_response_get_string(response);
491
+ if (responseStr) {
492
+ result = extractTextFromResponse(std::string(responseStr));
493
+ }
494
+ litert_lm_json_response_delete(response);
285
495
  #else
286
- // iOS: LiteRT-LM SDK not yet available, throw clear error
287
- throw std::runtime_error(
288
- "sendMessageWithImage is not supported on iOS. "
289
- "LiteRT-LM iOS SDK is not yet available. "
290
- "Please use text-only sendMessage() for now.");
496
+ result = "[iOS only] Vision inference not available on this platform.";
291
497
  #endif
498
+
499
+ auto endTime = std::chrono::steady_clock::now();
500
+ lastStats_.totalTime = std::chrono::duration<double>(endTime - startTime).count();
501
+
502
+ history_.push_back(Message{Role::USER, message + " [image: " + imagePath + "]"});
503
+ history_.push_back(Message{Role::MODEL, result});
504
+
505
+ return result;
292
506
  }
293
507
 
294
- #endif
295
- }
508
+ // =============================================================================
509
+ // sendMessageWithAudio — Multimodal (audio)
510
+ // =============================================================================
296
511
 
297
- //------------------------------------------------------------------------------
298
- // downloadModel - Download model file from URL
299
- //------------------------------------------------------------------------------
300
- std::future<std::string> HybridLiteRTLM::downloadModel(
301
- const std::string& url,
302
- const std::string& fileName,
303
- const std::optional<std::function<void(double)>>& onProgress) {
304
-
305
- // Return a future that throws an exception
306
- return std::async(std::launch::async, []() -> std::string {
307
- throw std::runtime_error(
308
- "downloadModel is not supported on iOS yet. "
309
- "Please download the model manually using a separate library."
310
- );
512
+ std::shared_ptr<Promise<std::string>> HybridLiteRTLM::sendMessageWithAudio(
513
+ const std::string& message,
514
+ const std::string& audioPath) {
515
+ return Promise<std::string>::async([this, message, audioPath]() -> std::string {
516
+ return sendMessageWithAudioInternal(message, audioPath);
311
517
  });
312
518
  }
313
519
 
314
- //------------------------------------------------------------------------------
315
- // sendMessageWithAudio - Multimodal audio + text
316
- //------------------------------------------------------------------------------
317
- std::string HybridLiteRTLM::sendMessageWithAudio(
520
+ std::string HybridLiteRTLM::sendMessageWithAudioInternal(
318
521
  const std::string& message,
319
522
  const std::string& audioPath) {
320
523
 
321
524
  std::lock_guard<std::mutex> lock(mutex_);
322
525
  ensureLoaded();
323
526
 
324
- #ifdef LITERT_LM_ENABLED
325
- // Load audio file
326
- std::ifstream audioFile(audioPath, std::ios::binary);
327
- if (!audioFile) {
328
- throw std::runtime_error("Failed to open audio file: " + audioPath);
527
+ auto startTime = std::chrono::steady_clock::now();
528
+ std::string result;
529
+
530
+ #ifdef __APPLE__
531
+ std::ifstream audioFile(audioPath);
532
+ if (!audioFile.good()) {
533
+ throw std::runtime_error("Audio file not found: " + audioPath);
329
534
  }
535
+ audioFile.close();
330
536
 
331
- // Simple WAV header skip (simplistic, assuming standard header size for now or raw)
332
- // Ideally use a WAV parsing library or miniaudio if available.
333
- // For this implementation, we read the whole file.
334
- std::vector<uint8_t> audioData((std::istreambuf_iterator<char>(audioFile)), std::istreambuf_iterator<char>());
537
+ std::string msgJson = buildAudioMessageJson(message, audioPath);
335
538
 
336
- litert::lm::UserMessage lm_message;
337
- lm_message.role = "user";
539
+ auto* response = litert_lm_conversation_send_message(
540
+ conversation_, msgJson.c_str(), nullptr);
338
541
 
339
- litert::lm::ContentPart textPart;
340
- textPart.type = litert::lm::ContentType::TEXT;
341
- textPart.text = message;
342
- lm_message.parts.push_back(textPart);
343
-
344
- litert::lm::ContentPart audioPart;
345
- audioPart.type = litert::lm::ContentType::AUDIO;
346
- audioPart.audio.data = audioData;
347
- // Metadata like sample rate might be needed:
348
- // audioPart.audio.sample_rate = 16000;
349
- lm_message.parts.push_back(audioPart);
350
-
351
- auto response = conversation_->SendMessage(lm_message);
352
- if (!response.ok()) {
353
- throw std::runtime_error("Audio inference failed: " +
354
- std::string(response.status().message()));
542
+ if (!response) {
543
+ throw std::runtime_error("LiteRT-LM: sendMessageWithAudio failed");
355
544
  }
356
545
 
357
- Message userMessage;
358
- userMessage.role = Role::USER;
359
- userMessage.content = message + " [Audio]";
360
- history_.push_back(userMessage);
546
+ const char* responseStr = litert_lm_json_response_get_string(response);
547
+ if (responseStr) {
548
+ result = extractTextFromResponse(std::string(responseStr));
549
+ }
550
+ litert_lm_json_response_delete(response);
551
+ #else
552
+ result = "[iOS only] Audio inference not available on this platform.";
553
+ #endif
361
554
 
362
- Message modelMessage;
363
- modelMessage.role = Role::MODEL;
364
- modelMessage.content = response->content;
365
- history_.push_back(modelMessage);
555
+ auto endTime = std::chrono::steady_clock::now();
556
+ lastStats_.totalTime = std::chrono::duration<double>(endTime - startTime).count();
366
557
 
367
- return response->content;
558
+ history_.push_back(Message{Role::USER, message + " [audio: " + audioPath + "]"});
559
+ history_.push_back(Message{Role::MODEL, result});
368
560
 
561
+ return result;
562
+ }
563
+
564
+ // =============================================================================
565
+ // downloadModel — Download model from URL
566
+ // =============================================================================
567
+
568
+ std::shared_ptr<Promise<std::string>> HybridLiteRTLM::downloadModel(
569
+ const std::string& url,
570
+ const std::string& fileName,
571
+ const std::optional<std::function<void(double)>>& onProgress) {
572
+ return Promise<std::string>::async([url, fileName, onProgress]() -> std::string {
573
+ #ifdef __APPLE__
574
+ return litert_lm::downloadModelFile(url, fileName, onProgress);
369
575
  #else
370
- // iOS: LiteRT-LM SDK not yet available, throw clear error
371
- throw std::runtime_error(
372
- "sendMessageWithAudio is not supported on iOS. "
373
- "LiteRT-LM iOS SDK is not yet available. "
374
- "Please use text-only sendMessage() for now.");
576
+ std::string destPath = "/tmp/" + fileName;
577
+ std::string curlCmd = "curl -L -o \"" + destPath + "\" \"" + url + "\"";
578
+ int result = system(curlCmd.c_str());
579
+ if (result != 0) {
580
+ throw std::runtime_error("Failed to download model from: " + url);
581
+ }
582
+ if (onProgress.has_value()) {
583
+ onProgress.value()(1.0);
584
+ }
585
+ return destPath;
375
586
  #endif
587
+ });
376
588
  }
377
589
 
378
- //------------------------------------------------------------------------------
379
- // sendMessageAsync - Streaming token generation
380
- //------------------------------------------------------------------------------
381
- void HybridLiteRTLM::sendMessageAsync(
382
- const std::string& message,
383
- const std::function<void(std::string, bool)>& onToken) {
384
-
385
- // Note: We don't hold the lock during the entire async operation
386
- // to avoid blocking other operations. The callback may be invoked
387
- // from a different thread depending on LiteRT-LM's implementation.
388
-
389
- {
390
- std::lock_guard<std::mutex> lock(mutex_);
391
- ensureLoaded();
392
- }
393
-
394
- #ifdef LITERT_LM_ENABLED
395
- // Add user message to history before starting
396
- {
397
- std::lock_guard<std::mutex> lock(mutex_);
398
- Message userMessage;
399
- userMessage.role = Role::USER;
400
- userMessage.content = message;
401
- history_.push_back(userMessage);
402
- }
403
-
404
- litert::lm::UserMessage lm_message;
405
- lm_message.role = "user";
406
- lm_message.content = message;
407
-
408
- std::string fullResponse;
409
-
410
- // The callback needs to be carefully managed for thread safety
411
- auto status = conversation_->SendMessageAsync(
412
- lm_message,
413
- [this, &onToken, &fullResponse](const std::string& token, bool isDone) {
414
- fullResponse += token;
415
-
416
- // Invoke the JS callback (Nitro handles thread marshalling)
417
- onToken(token, isDone);
418
-
419
- if (isDone) {
420
- // Add complete response to history
421
- std::lock_guard<std::mutex> lock(mutex_);
422
- Message modelMessage;
423
- modelMessage.role = Role::MODEL;
424
- modelMessage.content = fullResponse;
425
- history_.push_back(modelMessage);
426
- }
427
- }
428
- );
429
-
430
- if (!status.ok()) {
431
- // Remove user message since inference failed
432
- std::lock_guard<std::mutex> lock(mutex_);
433
- if (!history_.empty()) {
434
- history_.pop_back();
590
+ std::shared_ptr<Promise<void>> HybridLiteRTLM::deleteModel(const std::string& fileName) {
591
+ return Promise<void>::async([fileName]() {
592
+ std::string path;
593
+ #ifdef __APPLE__
594
+ // Match the path used by IOSDownloadHelper: ~/Library/Caches/litert_models/
595
+ const char* home = getenv("HOME");
596
+ if (home) {
597
+ path = std::string(home) + "/Library/Caches/litert_models/" + fileName;
435
598
  }
436
- throw std::runtime_error("Async inference failed: " +
437
- std::string(status.message()));
438
- }
439
-
440
599
  #else
441
- // Stub: Simulate streaming by calling sendMessage and splitting response
442
- std::string fullResponse;
443
- {
444
- std::lock_guard<std::mutex> lock(mutex_);
445
-
446
- // Add user message
447
- Message userMessage;
448
- userMessage.role = Role::USER;
449
- userMessage.content = message;
450
- history_.push_back(userMessage);
451
-
452
- fullResponse = "[LiteRT-LM Stub] Streaming response placeholder. You said: " + message;
453
- }
454
-
455
- // Simulate token-by-token streaming
456
- std::string currentWord;
457
- for (size_t i = 0; i < fullResponse.length(); i++) {
458
- char c = fullResponse[i];
459
- currentWord += c;
460
-
461
- if (c == ' ' || c == '\n' || i == fullResponse.length() - 1) {
462
- bool isDone = (i == fullResponse.length() - 1);
463
- onToken(currentWord, isDone);
464
- currentWord.clear();
465
- }
466
- }
467
-
468
- // Add model response to history
469
- {
470
- std::lock_guard<std::mutex> lock(mutex_);
471
- Message modelMessage;
472
- modelMessage.role = Role::MODEL;
473
- modelMessage.content = fullResponse;
474
- history_.push_back(modelMessage);
475
- }
600
+ path = "/tmp/" + fileName;
476
601
  #endif
602
+ if (!path.empty()) {
603
+ std::remove(path.c_str());
604
+ }
605
+ });
477
606
  }
478
607
 
479
- //------------------------------------------------------------------------------
480
- // getHistory - Return conversation history
481
- //------------------------------------------------------------------------------
608
+ // =============================================================================
609
+ // getHistory
610
+ // =============================================================================
611
+
482
612
  std::vector<Message> HybridLiteRTLM::getHistory() {
483
613
  std::lock_guard<std::mutex> lock(mutex_);
484
614
  return history_;
485
615
  }
486
616
 
487
- //------------------------------------------------------------------------------
488
- // resetConversation - Clear KV cache, keep engine
489
- //------------------------------------------------------------------------------
617
+ // =============================================================================
618
+ // resetConversation
619
+ // =============================================================================
620
+
490
621
  void HybridLiteRTLM::resetConversation() {
491
622
  std::lock_guard<std::mutex> lock(mutex_);
492
623
 
493
- #ifdef LITERT_LM_ENABLED
494
- // Destroy old conversation and create a new one
495
- // This clears the KV cache but keeps the (expensive) Engine loaded
496
- if (engine_) {
497
- conversation_.reset();
624
+ history_.clear();
625
+ lastStats_ = GenerationStats{0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
626
+
627
+ #ifdef __APPLE__
628
+ if (isLoaded_ && engine_) {
498
629
  createNewConversation();
499
630
  }
500
631
  #endif
501
-
502
- history_.clear();
503
632
  }
504
633
 
505
- //------------------------------------------------------------------------------
506
- // isReady - Check if model is loaded
507
- //------------------------------------------------------------------------------
634
+ // =============================================================================
635
+ // isReady
636
+ // =============================================================================
637
+
508
638
  bool HybridLiteRTLM::isReady() {
509
639
  std::lock_guard<std::mutex> lock(mutex_);
510
640
  return isLoaded_;
511
641
  }
512
642
 
513
- //------------------------------------------------------------------------------
514
- // getStats - Return last generation statistics
515
- //------------------------------------------------------------------------------
643
+ // =============================================================================
644
+ // getStats
645
+ // =============================================================================
646
+
516
647
  GenerationStats HybridLiteRTLM::getStats() {
517
648
  std::lock_guard<std::mutex> lock(mutex_);
518
649
  return lastStats_;
519
650
  }
520
651
 
521
- //------------------------------------------------------------------------------
522
- // getMemoryUsage - Return real memory usage from OS
523
- //------------------------------------------------------------------------------
652
+ // =============================================================================
653
+ // getMemoryUsage — Uses Mach APIs for iOS process memory
654
+ // =============================================================================
655
+
524
656
  MemoryUsage HybridLiteRTLM::getMemoryUsage() {
525
- double nativeHeapBytes = 0;
526
- double residentBytes = 0;
527
- double availableMemoryBytes = 0;
657
+ double usedMemoryBytes = 0;
658
+ double totalMemoryBytes = 0;
659
+ double availableBytes = 0;
528
660
  bool isLowMemory = false;
529
-
661
+
530
662
  #ifdef __APPLE__
531
- // Get process memory info via Mach APIs
532
- struct mach_task_basic_info taskInfo;
533
- mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
534
- if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO,
535
- (task_info_t)&taskInfo, &infoCount) == KERN_SUCCESS) {
536
- residentBytes = static_cast<double>(taskInfo.resident_size);
537
- }
538
-
539
- // Get system-wide memory pressure
540
- vm_statistics64_data_t vmStats;
541
- mach_msg_type_number_t vmCount = HOST_VM_INFO64_COUNT;
542
- if (host_statistics64(mach_host_self(), HOST_VM_INFO64,
543
- (host_info64_t)&vmStats, &vmCount) == KERN_SUCCESS) {
544
- vm_size_t pageSize;
545
- host_page_size(mach_host_self(), &pageSize);
546
- availableMemoryBytes = static_cast<double>(vmStats.free_count) * pageSize;
547
- // Consider low memory if free pages < 10% of total active+inactive+free
548
- uint64_t totalPages = vmStats.active_count + vmStats.inactive_count + vmStats.free_count;
549
- isLowMemory = (totalPages > 0) &&
550
- (static_cast<double>(vmStats.free_count) / totalPages < 0.1);
551
- }
552
-
553
- // malloc_size is per-allocation; use resident_size as native heap proxy
554
- nativeHeapBytes = residentBytes;
555
- #endif
556
-
557
- #ifdef __ANDROID__
558
- // Parse /proc/self/status for VmRSS (resident set size)
559
- std::ifstream statusFile("/proc/self/status");
560
- if (statusFile.is_open()) {
561
- std::string line;
562
- while (std::getline(statusFile, line)) {
563
- if (line.rfind("VmRSS:", 0) == 0) {
564
- // Format: "VmRSS: 123456 kB"
565
- std::istringstream iss(line.substr(6));
566
- double kbValue = 0;
567
- iss >> kbValue;
568
- residentBytes = kbValue * 1024.0;
569
- break;
570
- }
571
- }
663
+ // Get app process memory (resident set size)
664
+ struct mach_task_basic_info info;
665
+ mach_msg_type_number_t count = MACH_TASK_BASIC_INFO_COUNT;
666
+
667
+ kern_return_t kr = task_info(mach_task_self(),
668
+ MACH_TASK_BASIC_INFO,
669
+ (task_info_t)&info,
670
+ &count);
671
+
672
+ if (kr == KERN_SUCCESS) {
673
+ usedMemoryBytes = static_cast<double>(info.resident_size);
572
674
  }
573
-
574
- // Use mallinfo for native heap
575
- struct mallinfo mi = mallinfo();
576
- nativeHeapBytes = static_cast<double>(mi.uordblks); // total allocated space
577
-
578
- // Parse /proc/meminfo for available memory
579
- std::ifstream memFile("/proc/meminfo");
580
- if (memFile.is_open()) {
581
- std::string line;
582
- while (std::getline(memFile, line)) {
583
- if (line.rfind("MemAvailable:", 0) == 0) {
584
- std::istringstream iss(line.substr(13));
585
- double kbValue = 0;
586
- iss >> kbValue;
587
- availableMemoryBytes = kbValue * 1024.0;
588
- break;
589
- }
590
- }
675
+
676
+ // Get total physical memory
677
+ mach_port_t host_port = mach_host_self();
678
+ struct host_basic_info hostInfo;
679
+ mach_msg_type_number_t hostCount = HOST_BASIC_INFO_COUNT;
680
+
681
+ kr = host_info(host_port, HOST_BASIC_INFO,
682
+ (host_info_t)&hostInfo, &hostCount);
683
+
684
+ if (kr == KERN_SUCCESS) {
685
+ totalMemoryBytes = static_cast<double>(hostInfo.max_mem);
591
686
  }
592
-
593
- // Consider low if available < 256MB
594
- isLowMemory = availableMemoryBytes > 0 && availableMemoryBytes < 256.0 * 1024 * 1024;
687
+
688
+ availableBytes = totalMemoryBytes - usedMemoryBytes;
689
+ if (availableBytes < 0) availableBytes = 0;
690
+
691
+ // Low memory threshold (~200MB available)
692
+ isLowMemory = (totalMemoryBytes > 0) && (availableBytes < 200.0 * 1024.0 * 1024.0);
595
693
  #endif
596
-
597
- return MemoryUsage{nativeHeapBytes, residentBytes, availableMemoryBytes, isLowMemory};
694
+
695
+ return MemoryUsage{
696
+ usedMemoryBytes, // nativeHeapBytes
697
+ usedMemoryBytes, // residentBytes
698
+ availableBytes, // availableMemoryBytes
699
+ isLowMemory // isLowMemory
700
+ };
598
701
  }
599
702
 
600
- //------------------------------------------------------------------------------
601
- // close - Release all native resources
602
- //------------------------------------------------------------------------------
703
+ // =============================================================================
704
+ // close — Clean up all LiteRT-LM resources
705
+ // =============================================================================
706
+
603
707
  void HybridLiteRTLM::close() {
604
- std::lock_guard<std::mutex> lock(mutex_);
605
-
606
- #ifdef LITERT_LM_ENABLED
607
- // Release in reverse order of creation
608
- conversation_.reset();
609
- engine_.reset();
610
- #endif
708
+ // Note: Don't lock here if called from destructor (mutex may be destroyed)
709
+ // The caller (loadModel, destructor) should handle locking.
611
710
 
612
711
  isLoaded_ = false;
613
712
  history_.clear();
713
+
714
+ #ifdef __APPLE__
715
+ if (conversation_) {
716
+ litert_lm_conversation_delete(conversation_);
717
+ conversation_ = nullptr;
718
+ }
719
+ if (conv_config_) {
720
+ litert_lm_conversation_config_delete(conv_config_);
721
+ conv_config_ = nullptr;
722
+ }
723
+ if (session_config_) {
724
+ litert_lm_session_config_delete(session_config_);
725
+ session_config_ = nullptr;
726
+ }
727
+ if (engine_) {
728
+ litert_lm_engine_delete(engine_);
729
+ engine_ = nullptr;
730
+ }
731
+ #endif
732
+
733
+ lastStats_ = GenerationStats{0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
614
734
  }
615
735
 
616
736
  } // namespace margelo::nitro::litertlm