npm - react-native-litert-lm - Versions diffs - 0.3.7 → 0.4.1 - Mend

react-native-litert-lm 0.3.7 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (88) hide show

package/README.md +153 -135
package/android/build.gradle +12 -0
package/android/src/main/AndroidManifest.xml +8 -0
package/android/src/main/java/com/margelo/nitro/dev/litert/litertlm/HybridLiteRTLM.kt +276 -62
package/android/src/main/java/dev/litert/litertlm/LiteRTLMPackage.kt +19 -2
package/android/src/test/java/com/margelo/nitro/core/Promise.kt +46 -0
package/android/src/test/java/com/margelo/nitro/dev/litert/litertlm/HybridLiteRTLMTest.kt +105 -0
package/ios/HybridLiteRTLM.swift +1344 -0
package/ios/Tests/HybridLiteRTLMTests.swift +113 -0
package/lib/__mocks__/react-native-nitro-modules.d.ts +65 -0
package/lib/__mocks__/react-native-nitro-modules.js +60 -0
package/lib/__tests__/hooks.test.d.ts +1 -0
package/lib/__tests__/hooks.test.js +124 -0
package/lib/__tests__/memoryTracker.test.d.ts +1 -0
package/lib/__tests__/memoryTracker.test.js +74 -0
package/lib/__tests__/modelFactory.test.d.ts +1 -0
package/lib/__tests__/modelFactory.test.js +68 -0
package/lib/hooks.js +27 -3
package/lib/index.d.ts +6 -2
package/lib/index.js +8 -8
package/lib/modelFactory.js +82 -63
package/lib/specs/LiteRTLM.nitro.d.ts +87 -2
package/nitrogen/generated/android/LiteRTLMOnLoad.cpp +2 -2
package/nitrogen/generated/android/c++/JHybridLiteRTLMSpec.cpp +94 -9
package/nitrogen/generated/android/c++/JHybridLiteRTLMSpec.hpp +5 -1
package/nitrogen/generated/android/c++/JLLMConfig.hpp +40 -3
package/nitrogen/generated/android/c++/JMultimodalPart.hpp +74 -0
package/nitrogen/generated/android/c++/JPartType.hpp +61 -0
package/nitrogen/generated/android/c++/JToolDefinition.hpp +65 -0
package/nitrogen/generated/android/kotlin/com/margelo/nitro/dev/litert/litertlm/GenerationStats.kt +23 -0
package/nitrogen/generated/android/kotlin/com/margelo/nitro/dev/litert/litertlm/HybridLiteRTLMSpec.kt +28 -2
package/nitrogen/generated/android/kotlin/com/margelo/nitro/dev/litert/litertlm/LLMConfig.kt +46 -3
package/nitrogen/generated/android/kotlin/com/margelo/nitro/dev/litert/litertlm/MemoryUsage.kt +19 -0
package/nitrogen/generated/android/kotlin/com/margelo/nitro/dev/litert/litertlm/Message.kt +15 -0
package/nitrogen/generated/android/kotlin/com/margelo/nitro/dev/litert/litertlm/MultimodalPart.kt +66 -0
package/nitrogen/generated/android/kotlin/com/margelo/nitro/dev/litert/litertlm/PartType.kt +24 -0
package/nitrogen/generated/android/kotlin/com/margelo/nitro/dev/litert/litertlm/ToolDefinition.kt +61 -0
package/nitrogen/generated/ios/LiteRTLM-Swift-Cxx-Bridge.cpp +57 -1
package/nitrogen/generated/ios/LiteRTLM-Swift-Cxx-Bridge.hpp +414 -3
package/nitrogen/generated/ios/LiteRTLM-Swift-Cxx-Umbrella.hpp +41 -3
package/nitrogen/generated/ios/LiteRTLMAutolinking.mm +4 -6
package/nitrogen/generated/ios/LiteRTLMAutolinking.swift +10 -0
package/nitrogen/generated/ios/c++/HybridLiteRTLMSpecSwift.cpp +11 -0
package/nitrogen/generated/ios/c++/HybridLiteRTLMSpecSwift.hpp +240 -0
package/nitrogen/generated/ios/swift/Backend.swift +44 -0
package/nitrogen/generated/ios/swift/Func_void.swift +46 -0
package/nitrogen/generated/ios/swift/Func_void_double.swift +46 -0
package/nitrogen/generated/ios/swift/Func_void_std__exception_ptr.swift +46 -0
package/nitrogen/generated/ios/swift/Func_void_std__string.swift +46 -0
package/nitrogen/generated/ios/swift/Func_void_std__string_bool.swift +46 -0
package/nitrogen/generated/ios/swift/GenerationStats.swift +54 -0
package/nitrogen/generated/ios/swift/HybridLiteRTLMSpec.swift +71 -0
package/nitrogen/generated/ios/swift/HybridLiteRTLMSpec_cxx.swift +431 -0
package/nitrogen/generated/ios/swift/LLMConfig.swift +203 -0
package/nitrogen/generated/ios/swift/MemoryUsage.swift +44 -0
package/nitrogen/generated/ios/swift/Message.swift +34 -0
package/nitrogen/generated/ios/swift/MultimodalPart.swift +83 -0
package/nitrogen/generated/ios/swift/PartType.swift +44 -0
package/nitrogen/generated/ios/swift/Role.swift +44 -0
package/nitrogen/generated/ios/swift/ToolDefinition.swift +39 -0
package/nitrogen/generated/shared/c++/HybridLiteRTLMSpec.cpp +4 -0
package/nitrogen/generated/shared/c++/HybridLiteRTLMSpec.hpp +9 -2
package/nitrogen/generated/shared/c++/LLMConfig.hpp +22 -2
package/nitrogen/generated/shared/c++/MultimodalPart.hpp +99 -0
package/nitrogen/generated/shared/c++/PartType.hpp +80 -0
package/nitrogen/generated/shared/c++/ToolDefinition.hpp +91 -0
package/package.json +22 -11
package/react-native-litert-lm.podspec +17 -19
package/scripts/download-ios-frameworks.sh +17 -50
package/scripts/framework-source.js +46 -0
package/scripts/postinstall.js +40 -18
package/src/__mocks__/react-native-nitro-modules.ts +58 -0
package/src/__tests__/hooks.test.ts +153 -0
package/src/__tests__/memoryTracker.test.ts +87 -0
package/src/__tests__/modelFactory.test.ts +96 -0
package/src/hooks.ts +29 -7
package/src/index.ts +7 -10
package/src/modelFactory.ts +104 -80
package/src/specs/LiteRTLM.nitro.ts +106 -2
package/cpp/HybridLiteRTLM.cpp +0 -939
package/cpp/HybridLiteRTLM.hpp +0 -169
package/cpp/IOSDownloadHelper.h +0 -24
package/ios/IOSDownloadHelper.mm +0 -129
package/scripts/build-ios-engine.sh +0 -302
package/scripts/stubs/cxx_bridge_stubs.cc +0 -224
package/scripts/stubs/gemma_model_constraint_provider.cc +0 -46
package/scripts/stubs/llguidance_stubs.c +0 -101
package/src/templates.ts +0 -105

package/cpp/HybridLiteRTLM.cpp DELETED Viewed

@@ -1,939 +0,0 @@
-//
-// HybridLiteRTLM.cpp
-// react-native-litert-lm
-//
-// High-performance LLM inference using LiteRT-LM C API.
-//
-// NOTE: This C++ implementation is used for iOS ONLY.
-// Android uses the Kotlin implementation in `android/src/main/java/com/margelo/nitro/dev/litert/litertlm/HybridLiteRTLM.kt`.
-// Do not assume changes here will affect Android.
-//
-#include "HybridLiteRTLM.hpp"
-#include <NitroModules/Promise.hpp>
-#include <chrono>
-#include <stdexcept>
-#include <sstream>
-#include <sys/stat.h>
-#include <cstdio>
-#ifdef __APPLE__
-#include "IOSDownloadHelper.h"
-#include <os/proc.h>
-#endif
-#include <fstream>
-#include <thread>
-#include <regex>
-#include <pthread.h>
-#include <functional>
-namespace margelo::nitro::litertlm {
-// =============================================================================
-// Thread Helper — LiteRT engine operations need >512KB stack (XNNPack, Metal)
-// =============================================================================
-/**
- * Runs `work` synchronously on a newly created pthread whose stack is
- * `stackSize` bytes (8 MiB by default) and blocks until that thread finishes.
- *
- * An exception thrown by `work` is captured on the worker thread and rethrown
- * on the calling thread after the join.
- *
- * @param work      Callable to execute on the large-stack thread.
- * @param stackSize Requested stack size in bytes.
- * @throws std::runtime_error if the thread attributes cannot be initialised,
- *         the stack size is rejected, or the thread cannot be created.
- */
-static void runOnLargeStack(std::function<void()> work, size_t stackSize = 8 * 1024 * 1024) {
-  struct Context {
-    std::function<void()> fn;
-    std::exception_ptr exception;
-  };
-  Context ctx{std::move(work), nullptr};
-  pthread_attr_t attr;
-  int rc = pthread_attr_init(&attr);
-  if (rc != 0) {
-    // `attr` is unusable here, so it must not be passed to pthread_create.
-    throw std::runtime_error("Failed to initialize thread attributes (errno: " + std::to_string(rc) + ")");
-  }
-  // If the requested size is rejected the attribute keeps its default stack
-  // size, which defeats the purpose of this helper — report it instead.
-  rc = pthread_attr_setstacksize(&attr, stackSize);
-  if (rc != 0) {
-    pthread_attr_destroy(&attr);
-    throw std::runtime_error("Failed to set thread stack size (errno: " + std::to_string(rc) + ")");
-  }
-  pthread_t thread;
-  rc = pthread_create(&thread, &attr, [](void* arg) -> void* {
-    auto* c = static_cast<Context*>(arg);
-    try {
-      c->fn();
-    } catch (...) {
-      // Exceptions must not escape the thread entry point; hand them to the joiner.
-      c->exception = std::current_exception();
-    }
-    return nullptr;
-  }, &ctx);
-  pthread_attr_destroy(&attr);
-  if (rc != 0) {
-    throw std::runtime_error("Failed to create large-stack thread (errno: " + std::to_string(rc) + ")");
-  }
-  // `ctx` lives on this stack frame, so the join must happen before returning.
-  pthread_join(thread, nullptr);
-  if (ctx.exception) {
-    std::rethrow_exception(ctx.exception);
-  }
-}
-// =============================================================================
-// JSON Helpers
-// =============================================================================
-/**
- * Escapes `input` so it can be placed between double quotes in a JSON document.
- *
- * Quotes, backslashes and \n, \r, \t, \b, \f use their two-character escapes.
- * Every other character below 0x20 is written as \u00XX, because JSON does not
- * permit raw control characters inside a string. All remaining bytes
- * (including multi-byte UTF-8 sequences) are copied through unchanged.
- *
- * @param input Raw text.
- * @return The escaped text, without surrounding quotes.
- */
-std::string HybridLiteRTLM::escapeJson(const std::string& input) {
-  static const char kHexDigits[] = "0123456789abcdef";
-  std::string output;
-  output.reserve(input.size() + 16);
-  for (char c : input) {
-    switch (c) {
-      case '"':  output += "\\\""; break;
-      case '\\': output += "\\\\"; break;
-      case '\n': output += "\\n"; break;
-      case '\r': output += "\\r"; break;
-      case '\t': output += "\\t"; break;
-      case '\b': output += "\\b"; break;
-      case '\f': output += "\\f"; break;
-      default: {
-        // Go through unsigned char so bytes >= 0x80 are never seen as negative.
-        const unsigned char byte = static_cast<unsigned char>(c);
-        if (byte < 0x20) {
-          output += "\\u00";
-          output += kHexDigits[(byte >> 4) & 0x0F];
-          output += kHexDigits[byte & 0x0F];
-        } else {
-          output += c;
-        }
-        break;
-      }
-    }
-  }
-  return output;
-}
-/** Builds a user-role message object whose content is `text` as a JSON string. */
-std::string HybridLiteRTLM::buildTextMessageJson(const std::string& text) {
-  std::string json = "{\"role\":\"user\",\"content\":\"";
-  json += escapeJson(text);
-  json += "\"}";
-  return json;
-}
-/**
- * Builds a user-role message whose content is a two-element array: a text part
- * holding `text` followed by an image part holding `imagePath`.
- */
-std::string HybridLiteRTLM::buildImageMessageJson(const std::string& text, const std::string& imagePath) {
-  std::string json = "{\"role\":\"user\",\"content\":[";
-  json += "{\"type\":\"text\",\"text\":\"";
-  json += escapeJson(text);
-  json += "\"},";
-  json += "{\"type\":\"image\",\"path\":\"";
-  json += escapeJson(imagePath);
-  json += "\"}";
-  json += "]}";
-  return json;
-}
-/**
- * Builds a user-role message whose content is a two-element array: a text part
- * holding `text` followed by an audio part holding `audioPath`.
- */
-std::string HybridLiteRTLM::buildAudioMessageJson(const std::string& text, const std::string& audioPath) {
-  std::string json = "{\"role\":\"user\",\"content\":[";
-  json += "{\"type\":\"text\",\"text\":\"";
-  json += escapeJson(text);
-  json += "\"},";
-  json += "{\"type\":\"audio\",\"path\":\"";
-  json += escapeJson(audioPath);
-  json += "\"}";
-  json += "]}";
-  return json;
-}
-/**
- * Gemma / LiteRT-LM control tokens that the iOS C API includes in raw output.
- * The Android Kotlin SDK strips these automatically.
- *
- * Order matters: stripControlTokens() removes the entries in array order, so
- * the longer "<start_of_turn>model" / "<start_of_turn>user" forms are listed
- * before the bare "<start_of_turn>" to avoid leaving "model" / "user" behind.
- */
-static const char* kControlTokens[] = {
-  "<end_of_turn>",
-  "<start_of_turn>model",
-  "<start_of_turn>user",
-  "<start_of_turn>",
-  "<eos>",
-};
-/**
- * Returns a copy of `text` with every occurrence of each kControlTokens entry
- * removed. No whitespace is touched: streaming pieces such as " the" or " is"
- * carry leading spaces that have to survive.
- */
-static std::string stripControlTokens(const std::string& text) {
-  std::string cleaned(text);
-  for (const char* marker : kControlTokens) {
-    const std::string needle(marker);
-    // Rescan from the beginning after each erase: removing one occurrence can
-    // join the surrounding characters into another occurrence.
-    for (size_t at = cleaned.find(needle); at != std::string::npos; at = cleaned.find(needle)) {
-      cleaned.erase(at, needle.length());
-    }
-  }
-  return cleaned;
-}
-/**
- * Returns how many leading characters of `text` may be emitted right now.
- *
- * Everything from the last '<' onward is withheld when that tail is a proper
- * prefix of some control token, since the next chunk may complete the token.
- * In every other case the whole of `text` is safe.
- */
-static size_t safeEmitLength(const std::string& text) {
-  const size_t bracketPos = text.rfind('<');
-  if (bracketPos == std::string::npos) {
-    return text.length();  // no candidate token start at all
-  }
-  const size_t tailLen = text.length() - bracketPos;
-  for (const char* marker : kControlTokens) {
-    const std::string candidate(marker);
-    // Only a strictly shorter tail counts: a complete token is not "partial".
-    if (tailLen >= candidate.length()) {
-      continue;
-    }
-    if (candidate.compare(0, tailLen, text, bracketPos, tailLen) == 0) {
-      return bracketPos;  // tail might still grow into this token — hold it
-    }
-  }
-  return text.length();
-}
-/**
- * Returns `text` without leading and trailing spaces, tabs, newlines and
- * carriage returns; an all-whitespace or empty input gives an empty string.
- */
-static std::string trimWhitespace(const std::string& text) {
-  const char* const blanks = " \t\n\r";
-  const size_t first = text.find_first_not_of(blanks);
-  if (first == std::string::npos) {
-    return "";
-  }
-  const size_t last = text.find_last_not_of(blanks);
-  const size_t count = last - first + 1;
-  return text.substr(first, count);
-}
-/**
- * Decodes the body of a JSON string literal inside `json`.
- *
- * `begin` is the index of the first character after the opening quote.
- * Decoding stops at the first unescaped '"' or at the end of `json`.
- * Recognised escapes: \" \\ \/ \n \r \t \b \f. Any other escape (including
- * \uXXXX) is copied through verbatim with its backslash.
- */
-static std::string decodeJsonStringBody(const std::string& json, size_t begin) {
-  std::string out;
-  out.reserve(json.size() - begin);
-  for (size_t i = begin; i < json.size(); ++i) {
-    const char c = json[i];
-    if (c == '"') {
-      break;  // closing quote of the value
-    }
-    if (c != '\\' || i + 1 >= json.size()) {
-      out += c;
-      continue;
-    }
-    switch (json[i + 1]) {
-      case '"':  out += '"';  ++i; break;
-      case '\\': out += '\\'; ++i; break;
-      case '/':  out += '/';  ++i; break;
-      case 'n':  out += '\n'; ++i; break;
-      case 'r':  out += '\r'; ++i; break;
-      case 't':  out += '\t'; ++i; break;
-      case 'b':  out += '\b'; ++i; break;
-      case 'f':  out += '\f'; ++i; break;
-      default:   out += c; break;  // unknown escape: keep the backslash as-is
-    }
-  }
-  return out;
-}
-/**
- * Pulls the model text out of a C API response and strips control tokens.
- *
- * Two layouts are understood:
- *   {"role":"model","content":[{"type":"text","text":"..."}]}
- *   {"role":"model","content":"..."}
- * Extraction is plain string searching (no JSON library dependency). If
- * neither layout matches, the whole input is returned with control tokens
- * stripped.
- *
- * Both layouts go through the same escape decoder, so \r, \t, \b, \f and \/
- * are decoded for the simple layout as well as the array layout.
- */
-std::string HybridLiteRTLM::extractTextFromResponse(const std::string& jsonResponse) {
-  // Array layout: the first "text":"..." that follows "type":"text".
-  const std::string textMarker = "\"text\":\"";
-  size_t pos = jsonResponse.find("\"type\":\"text\"");
-  if (pos != std::string::npos) {
-    pos = jsonResponse.find(textMarker, pos);
-    if (pos != std::string::npos) {
-      return stripControlTokens(decodeJsonStringBody(jsonResponse, pos + textMarker.length()));
-    }
-  }
-  // Simple layout: "content":"..."
-  const std::string contentMarker = "\"content\":\"";
-  pos = jsonResponse.find(contentMarker);
-  if (pos != std::string::npos) {
-    return stripControlTokens(decodeJsonStringBody(jsonResponse, pos + contentMarker.length()));
-  }
-  // Fallback: return full response (still strip control tokens)
-  return stripControlTokens(jsonResponse);
-}
-// =============================================================================
-// Conversation Management
-// =============================================================================
-/**
- * Replaces conversation_ and conv_config_ with freshly created ones built from
- * engine_, session_config_ and systemPrompt_.
- *
- * The whole body is compiled only when __APPLE__ is defined; elsewhere this
- * function does nothing.
- *
- * @throws std::runtime_error if engine_ is null, or if either the conversation
- *         config or the conversation itself cannot be created.
- */
-void HybridLiteRTLM::createNewConversation() {
-#ifdef __APPLE__
-  if (!engine_) {
-    throw std::runtime_error("Cannot create conversation: engine not initialized");
-  }
-  // Clean up previous conversation
-  if (conversation_) {
-    litert_lm_conversation_delete(conversation_);
-    conversation_ = nullptr;
-  }
-  if (conv_config_) {
-    litert_lm_conversation_config_delete(conv_config_);
-    conv_config_ = nullptr;
-  }
-  // Build system message JSON if provided
-  // systemMsgJson must stay alive until the config is created, because
-  // systemMsgPtr points into its buffer.
-  std::string systemMsgJson;
-  const char* systemMsgPtr = nullptr;
-  if (!systemPrompt_.empty()) {
-    systemMsgJson = "{\"role\":\"system\",\"content\":\"" + escapeJson(systemPrompt_) + "\"}";
-    systemMsgPtr = systemMsgJson.c_str();
-  }
-  // Create conversation config with session config
-  conv_config_ = litert_lm_conversation_config_create(
-    engine_,
-    session_config_,  // may be nullptr for defaults
-    systemMsgPtr,     // system message
-    nullptr,          // tools (not used yet)
-    nullptr,          // messages history
-    false             // constrained decoding
-  );
-  if (!conv_config_) {
-    throw std::runtime_error("Failed to create conversation config");
-  }
-  // Create conversation
-  conversation_ = litert_lm_conversation_create(engine_, conv_config_);
-  if (!conversation_) {
-    // Do not keep a config around without a matching conversation.
-    litert_lm_conversation_config_delete(conv_config_);
-    conv_config_ = nullptr;
-    throw std::runtime_error("Failed to create conversation");
-  }
-#endif
-}
-// =============================================================================
-// loadModel
-// =============================================================================
-/**
- * Asynchronous entry point for loading a model: schedules loadModelInternal()
- * through Promise<void>::async and runs it on a large-stack thread.
- * `modelPath` and `config` are copied into the scheduled task.
- */
-std::shared_ptr<Promise<void>> HybridLiteRTLM::loadModel(
-    const std::string& modelPath,
-    const std::optional<LLMConfig>& config) {
-  return Promise<void>::async([this, modelPath, config]() {
-    // By-reference capture is fine here: runOnLargeStack blocks until done.
-    auto loadJob = [&]() { loadModelInternal(modelPath, config); };
-    runOnLargeStack(loadJob);
-  });
-}
-/**
- * Synchronous body of loadModel(); mutex_ is held for the whole call.
- *
- * Steps:
- *   1. If isLoaded_ is set, close() is called first.
- *   2. Fields present in `config` overwrite the stored settings; fields that
- *      are absent keep whatever value they already had.
- *   3. (Apple only) An engine is created for the requested backend. If that
- *      fails, CPU-based fallbacks are tried and backend_ becomes CPU when one
- *      of them succeeds.
- *   4. (Apple only) A session config with sampler parameters is created and a
- *      new conversation is started.
- *   5. isLoaded_ is set, history_ is cleared and lastStats_ is zeroed.
- *
- * @param modelPath Path of the model file.
- * @param config    Optional overrides for backend and sampling settings.
- * @throws std::runtime_error (Apple only) when no engine could be created; the
- *         message includes file size / readability diagnostics for modelPath.
- */
-void HybridLiteRTLM::loadModelInternal(
-    const std::string& modelPath,
-    const std::optional<LLMConfig>& config) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  if (isLoaded_) {
-    // NOTE(review): close() runs while mutex_ is held — confirm it does not
-    // try to lock mutex_ itself.
-    close();
-  }
-  if (config.has_value()) {
-    if (config->backend.has_value()) {
-      backend_ = config->backend.value();
-    }
-    if (config->temperature.has_value()) {
-      temperature_ = config->temperature.value();
-    }
-    if (config->topK.has_value()) {
-      topK_ = config->topK.value();
-    }
-    if (config->topP.has_value()) {
-      topP_ = config->topP.value();
-    }
-    if (config->maxTokens.has_value()) {
-      maxTokens_ = config->maxTokens.value();
-    }
-    if (config->systemPrompt.has_value()) {
-      systemPrompt_ = config->systemPrompt.value();
-    }
-  }
-#ifdef __APPLE__
-  // Set log verbosity: 2=WARNING (production), 0=INFO (debug)
-  litert_lm_set_min_log_level(2);
-  auto backendStr = [](Backend b) -> const char* {
-    switch (b) {
-      case Backend::GPU: return "gpu";
-      case Backend::NPU: return "gpu"; // NPU not available on iOS, use GPU
-      default: return "cpu";
-    }
-  };
-  // Cache directory = directory that contains the model file. A path without
-  // any '/' refers to the current directory, and a path whose only '/' is the
-  // leading one lives in the root; neither may collapse to the file path
-  // itself or to an empty string.
-  const size_t lastSlash = modelPath.find_last_of('/');
-  std::string cacheDir;
-  if (lastSlash == std::string::npos) {
-    cacheDir = ".";
-  } else if (lastSlash == 0) {
-    cacheDir = "/";
-  } else {
-    cacheDir = modelPath.substr(0, lastSlash);
-  }
-  // Attempts to create engine_ with the given main / vision backends.
-  // Returns true when engine_ is non-null afterwards.
-  auto tryCreateEngine = [&](const char* backend, const char* visionBackend) -> bool {
-    auto* settings = litert_lm_engine_settings_create(
-      modelPath.c_str(),
-      backend,
-      visionBackend,
-      "cpu" // audio executor: iOS XCFramework lacks compiled audio ops (INTERNAL ERROR at Invoke)
-    );
-    if (!settings) {
-      return false;
-    }
-    litert_lm_engine_settings_set_max_num_tokens(settings, static_cast<int>(maxTokens_));
-    litert_lm_engine_settings_enable_benchmark(settings);
-    litert_lm_engine_settings_set_cache_dir(settings, cacheDir.c_str());
-    engine_ = litert_lm_engine_create(settings);
-    litert_lm_engine_settings_delete(settings);
-    return engine_ != nullptr;
-  };
-  // Try requested backend first (e.g. gpu/gpu)
-  const char* primaryBackend = backendStr(backend_);
-  if (!tryCreateEngine(primaryBackend, primaryBackend)) {
-    // Fallback chain for when the primary backend fails:
-    bool fallbackOk = false;
-    if (backend_ != Backend::CPU) {
-      // 1) Try CPU main + GPU vision (model's vision encoder often requires GPU)
-      fallbackOk = tryCreateEngine("cpu", "gpu");
-      // 2) Try CPU main + CPU vision
-      if (!fallbackOk) fallbackOk = tryCreateEngine("cpu", "cpu");
-    }
-    // 3) Try CPU main + no vision (nullptr skips vision executor entirely)
-    if (!fallbackOk) fallbackOk = tryCreateEngine("cpu", nullptr);
-    if (fallbackOk) {
-      backend_ = Backend::CPU;
-    }
-  }
-  if (!engine_) {
-    // Collect diagnostic info
-    std::string diag = " | Diagnostics: ";
-    struct stat st;
-    if (stat(modelPath.c_str(), &st) == 0) {
-      diag += "File size: " + std::to_string(st.st_size) + " bytes";
-    } else {
-      diag += "Failed to stat file (errno: " + std::to_string(errno) + ")";
-    }
-    FILE* f = fopen(modelPath.c_str(), "rb");
-    if (f) {
-      diag += ", Readable: YES";
-      fclose(f);
-    } else {
-      diag += ", Readable: NO (errno: " + std::to_string(errno) + ")";
-    }
-    throw std::runtime_error(
-      "Failed to create LiteRT-LM engine. Tried backend '" +
-      std::string(primaryBackend) + "' and CPU fallback. Model path: " + modelPath + diag);
-  }
-  session_config_ = litert_lm_session_config_create();
-  if (session_config_) {
-    litert_lm_session_config_set_max_output_tokens(session_config_, static_cast<int>(maxTokens_));
-    LiteRtLmSamplerParams sampler{};
-    sampler.type = kTopP;
-    sampler.top_k = static_cast<int32_t>(topK_);
-    sampler.top_p = static_cast<float>(topP_);
-    sampler.temperature = static_cast<float>(temperature_);
-    sampler.seed = 0;
-    litert_lm_session_config_set_sampler_params(session_config_, &sampler);
-  }
-  createNewConversation();
-#endif
-  isLoaded_ = true;
-  history_.clear();
-  lastStats_ = GenerationStats{0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-}
-// =============================================================================
-// sendMessage — Blocking text inference
-// =============================================================================
-/**
- * Asynchronous text inference: schedules sendMessageInternal() through
- * Promise<std::string>::async, runs it on a large-stack thread and resolves
- * with the model's reply.
- */
-std::shared_ptr<Promise<std::string>> HybridLiteRTLM::sendMessage(const std::string& message) {
-  return Promise<std::string>::async([this, message]() -> std::string {
-    std::string reply;
-    // runOnLargeStack blocks until the job is done, so `reply` outlives it.
-    auto job = [&]() { reply = sendMessageInternal(message); };
-    runOnLargeStack(job);
-    return reply;
-  });
-}
-/**
- * Blocking text inference, executed under mutex_.
- *
- * On Apple builds the message is sent to the current conversation, the reply
- * text is extracted and trimmed, and lastStats_ is refreshed from the
- * conversation's benchmark info. Other builds return a fixed placeholder.
- * In both cases lastStats_.totalTime (seconds) is updated and the exchange is
- * appended to history_.
- *
- * @throws std::runtime_error (Apple only) if the C API returns no response.
- */
-std::string HybridLiteRTLM::sendMessageInternal(const std::string& message) {
-  std::lock_guard<std::mutex> guard(mutex_);
-  ensureLoaded();
-  const auto began = std::chrono::steady_clock::now();
-  std::string reply;
-#ifdef __APPLE__
-  const std::string requestJson = buildTextMessageJson(message);
-  auto* response = litert_lm_conversation_send_message(
-    conversation_, requestJson.c_str(), nullptr);
-  if (!response) {
-    throw std::runtime_error("LiteRT-LM: sendMessage failed");
-  }
-  if (const char* raw = litert_lm_json_response_get_string(response)) {
-    reply = trimWhitespace(extractTextFromResponse(std::string(raw)));
-  }
-  litert_lm_json_response_delete(response);
-  if (auto* bench = litert_lm_conversation_get_benchmark_info(conversation_)) {
-    const int decodeTurns = litert_lm_benchmark_info_get_num_decode_turns(bench);
-    if (decodeTurns > 0) {
-      // Only the most recent decode turn describes this request.
-      const int latest = decodeTurns - 1;
-      lastStats_.tokensPerSecond = litert_lm_benchmark_info_get_decode_tokens_per_sec_at(bench, latest);
-      lastStats_.completionTokens = static_cast<double>(
-        litert_lm_benchmark_info_get_decode_token_count_at(bench, latest));
-    }
-    lastStats_.timeToFirstToken = litert_lm_benchmark_info_get_time_to_first_token(bench);
-    litert_lm_benchmark_info_delete(bench);
-  }
-#else
-  // Non-Apple stub
-  reply = "[iOS only] LiteRT-LM inference not available on this platform.";
-#endif
-  const auto finished = std::chrono::steady_clock::now();
-  const double elapsedMs = std::chrono::duration<double, std::milli>(finished - began).count();
-  lastStats_.totalTime = elapsedMs / 1000.0;
-  // Record the exchange
-  history_.push_back(Message{Role::USER, message});
-  history_.push_back(Message{Role::MODEL, reply});
-  return reply;
-}
-// =============================================================================
-// sendMessageAsync — Streaming text inference
-// =============================================================================
-/**
- * Callback passed to the C streaming API by sendMessageAsync().
- *
- * `callback_data` is the StreamContext allocated by sendMessageAsync(). Three
- * cases are handled, in this order:
- *   - error_msg != nullptr: forwards "Error: <msg>" with done=true and deletes
- *     the context.
- *   - is_final: updates the stats, flushes any text that was held back,
- *     appends the exchange to the history under the history mutex, emits
- *     ("", true) and deletes the context.
- *   - otherwise, when chunk != nullptr: appends the chunk to the raw buffer
- *     and emits the newly available, control-token-free text with done=false.
- *
- * The context is deleted only on the error and final paths.
- */
-void HybridLiteRTLM::streamCallbackFn(void* callback_data, const char* chunk,
-                                        bool is_final, const char* error_msg) {
-  auto* ctx = static_cast<StreamContext*>(callback_data);
-  if (error_msg) {
-    // Error occurred — notify JS and clean up
-    ctx->onToken(std::string("Error: ") + error_msg, true);
-    delete ctx;
-    return;
-  }
-  if (is_final) {
-    // Calculate stats
-    auto endTime = std::chrono::steady_clock::now();
-    double durationMs = std::chrono::duration<double, std::milli>(endTime - ctx->startTime).count();
-    // tokenCount is incremented once per emitted piece of text (see below),
-    // so these figures count emissions.
-    if (ctx->lastStats && ctx->tokenCount > 0) {
-      ctx->lastStats->completionTokens = static_cast<double>(ctx->tokenCount);
-      ctx->lastStats->totalTime = durationMs / 1000.0;
-      ctx->lastStats->tokensPerSecond = (ctx->tokenCount / durationMs) * 1000.0;
-    }
-    // Final flush: do one last clean of the full accumulated response
-    // to emit any text that was withheld by safeEmitLength.
-    std::string cleaned = stripControlTokens(ctx->rawResponse);
-    size_t start = cleaned.find_first_not_of(" \t\n\r");
-    if (start != std::string::npos) {
-      cleaned = cleaned.substr(start);
-      // Strip echoed user message
-      if (!ctx->userMessage.empty() && cleaned.find(ctx->userMessage) == 0) {
-        cleaned = cleaned.substr(ctx->userMessage.length());
-        size_t nextStart = cleaned.find_first_not_of(" \t\n\r");
-        cleaned = (nextStart != std::string::npos) ? cleaned.substr(nextStart) : "";
-      }
-      // Emit any remaining text not yet sent
-      if (cleaned.length() > ctx->lastEmittedLength) {
-        std::string remaining = cleaned.substr(ctx->lastEmittedLength);
-        ctx->onToken(remaining, false);
-      }
-      ctx->fullResponse = cleaned;
-    }
-    // Update history (thread-safe)
-    {
-      std::lock_guard<std::mutex> lock(*ctx->historyMutex);
-      ctx->history->push_back(Message{Role::USER, ctx->userMessage});
-      ctx->history->push_back(Message{Role::MODEL, ctx->fullResponse});
-    }
-    // Empty text with done=true marks the end of the stream.
-    ctx->onToken("", true);
-    delete ctx;
-    return;
-  }
-  if (chunk) {
-    std::string token(chunk);
-    // The C API may return JSON-wrapped responses (e.g.
-    // {"role":"model","content":[{"type":"text","text":"Hi"}]})
-    // instead of raw text tokens. Detect and extract text content.
-    std::string raw;
-    if (token.size() > 2 && token[0] == '{' && token.find("\"role\"") != std::string::npos) {
-      raw = HybridLiteRTLM::extractTextFromResponse(token);
-    } else {
-      raw = token;
-    }
-    // Accumulate raw text, then strip control tokens from the FULL buffer.
-    // This correctly handles tokens split across chunk boundaries (e.g.
-    // chunk1="<end_of_tu" chunk2="rn>Hello").
-    ctx->rawResponse += raw;
-    std::string cleaned = stripControlTokens(ctx->rawResponse);
-    // Trim leading whitespace from the overall response
-    size_t start = cleaned.find_first_not_of(" \t\n\r");
-    if (start == std::string::npos) {
-      // Still only whitespace/control tokens — nothing to emit yet
-      return;
-    }
-    cleaned = cleaned.substr(start);
-    // The C API may echo back the user's message before the model response.
-    // Strip the echoed user message prefix if present.
-    // NOTE(review): lastEmittedLength is used below as an offset into
-    // `cleaned` after this prefix is removed. If text was already emitted
-    // before the complete echoed message had arrived, that offset would refer
-    // to the unstripped text — confirm the API never splits the echo.
-    if (!ctx->userMessage.empty()) {
-      size_t userPos = cleaned.find(ctx->userMessage);
-      if (userPos == 0) {
-        cleaned = cleaned.substr(ctx->userMessage.length());
-        // Trim any whitespace after the stripped user message
-        size_t nextStart = cleaned.find_first_not_of(" \t\n\r");
-        if (nextStart == std::string::npos) {
-          return;  // Only user message so far, nothing to emit
-        }
-        cleaned = cleaned.substr(nextStart);
-      }
-    }
-    // Only emit text that is "safe" — withhold any trailing characters
-    // that could be the start of a control token split across chunks.
-    size_t safe = safeEmitLength(cleaned);
-    if (safe > ctx->lastEmittedLength) {
-      // Emit only the part beyond what earlier calls already delivered.
-      std::string newText = cleaned.substr(ctx->lastEmittedLength, safe - ctx->lastEmittedLength);
-      ctx->fullResponse = cleaned.substr(0, safe);
-      ctx->lastEmittedLength = safe;
-      ctx->tokenCount++;
-      ctx->onToken(newText, false);
-    }
-  }
-}
-/**
- * Starts streaming text inference and reports output through `onToken`.
- *
- * A StreamContext holding copies of the callback and the message, plus
- * pointers to history_, mutex_ and lastStats_, is allocated for the stream.
- * On Apple builds ownership of that context passes to streamCallbackFn, which
- * deletes it on completion or error. Other builds invoke `onToken` once with
- * a placeholder and done=true.
- *
- * @throws std::runtime_error (Apple only) if streaming could not be started.
- */
-void HybridLiteRTLM::sendMessageAsync(
-    const std::string& message,
-    const std::function<void(const std::string&, bool)>& onToken) {
-  // The context keeps its own copies, so nothing refers back to the caller's
-  // arguments once this function has returned.
-  auto owned = std::make_unique<StreamContext>();
-  owned->onToken = onToken;
-  owned->rawResponse = "";
-  owned->fullResponse = "";
-  owned->lastEmittedLength = 0;
-  owned->history = &history_;
-  owned->historyMutex = &mutex_;
-  owned->userMessage = message;
-  owned->lastStats = &lastStats_;
-  owned->startTime = std::chrono::steady_clock::now();
-  owned->tokenCount = 0;
-#ifdef __APPLE__
-  ensureLoaded();
-  const std::string requestJson = buildTextMessageJson(message);
-  // From here on the C callback owns the context through a raw pointer;
-  // streamCallbackFn frees it when the stream finishes or fails.
-  StreamContext* rawCtx = owned.release();
-  // Same large-stack treatment as every other engine entry point
-  // (XNNPack needs >512KB stack).
-  runOnLargeStack([&]() {
-    const int status = litert_lm_conversation_send_message_stream(
-      conversation_, requestJson.c_str(), nullptr,
-      streamCallbackFn, rawCtx);
-    if (status != 0) {
-      delete rawCtx;
-      throw std::runtime_error("LiteRT-LM: Failed to start streaming inference");
-    }
-  });
-#else
-  // Non-Apple stub; `owned` is released automatically on return.
-  owned->onToken("[iOS only] Streaming not available on this platform.", true);
-#endif
-}
-// =============================================================================
-// sendMessageWithImage — Multimodal (vision)
-// =============================================================================
-/**
- * Asynchronous vision inference: schedules sendMessageWithImageInternal()
- * through Promise<std::string>::async on a large-stack thread and resolves
- * with the model's reply.
- */
-std::shared_ptr<Promise<std::string>> HybridLiteRTLM::sendMessageWithImage(
-    const std::string& message,
-    const std::string& imagePath) {
-  return Promise<std::string>::async([this, message, imagePath]() -> std::string {
-    std::string reply;
-    // runOnLargeStack blocks until the job is done, so `reply` outlives it.
-    auto job = [&]() { reply = sendMessageWithImageInternal(message, imagePath); };
-    runOnLargeStack(job);
-    return reply;
-  });
-}
-/**
- * Blocking vision inference, executed under mutex_.
- *
- * On Apple builds the image file is checked for readability, a text+image
- * message is sent to the current conversation and the reply text is extracted
- * and trimmed. Other builds return a fixed placeholder. In both cases
- * lastStats_.totalTime (seconds) is updated and the exchange is appended to
- * history_, with the image path noted in the user entry.
- *
- * @throws std::runtime_error (Apple only) if the image cannot be opened or
- *         the C API returns no response.
- */
-std::string HybridLiteRTLM::sendMessageWithImageInternal(
-    const std::string& message,
-    const std::string& imagePath) {
-  std::lock_guard<std::mutex> guard(mutex_);
-  ensureLoaded();
-  const auto began = std::chrono::steady_clock::now();
-  std::string reply;
-#ifdef __APPLE__
-  {
-    // Readability probe only; the stream is closed when this scope ends.
-    std::ifstream probe(imagePath);
-    if (!probe.good()) {
-      throw std::runtime_error("Image file not found: " + imagePath);
-    }
-  }
-  // The C API performs the image preprocessing; only the path is passed on.
-  const std::string requestJson = buildImageMessageJson(message, imagePath);
-  auto* response = litert_lm_conversation_send_message(
-    conversation_, requestJson.c_str(), nullptr);
-  if (!response) {
-    throw std::runtime_error("LiteRT-LM: sendMessageWithImage failed");
-  }
-  if (const char* raw = litert_lm_json_response_get_string(response)) {
-    reply = trimWhitespace(extractTextFromResponse(std::string(raw)));
-  }
-  litert_lm_json_response_delete(response);
-#else
-  reply = "[iOS only] Vision inference not available on this platform.";
-#endif
-  const auto finished = std::chrono::steady_clock::now();
-  lastStats_.totalTime = std::chrono::duration<double>(finished - began).count();
-  history_.push_back(Message{Role::USER, message + " [image: " + imagePath + "]"});
-  history_.push_back(Message{Role::MODEL, reply});
-  return reply;
-}
-// =============================================================================
-// sendMessageWithAudio — Multimodal (audio)
-// =============================================================================
-/**
- * Asynchronous audio inference: schedules sendMessageWithAudioInternal()
- * through Promise<std::string>::async on a large-stack thread and resolves
- * with the model's reply.
- */
-std::shared_ptr<Promise<std::string>> HybridLiteRTLM::sendMessageWithAudio(
-    const std::string& message,
-    const std::string& audioPath) {
-  return Promise<std::string>::async([this, message, audioPath]() -> std::string {
-    std::string reply;
-    // runOnLargeStack blocks until the job is done, so `reply` outlives it.
-    auto job = [&]() { reply = sendMessageWithAudioInternal(message, audioPath); };
-    runOnLargeStack(job);
-    return reply;
-  });
-}
-/**
- * Blocking audio inference, executed under mutex_.
- *
- * On Apple builds the audio file is checked for readability, a text+audio
- * message is sent to the current conversation and the reply text is extracted
- * and trimmed. Other builds return a fixed placeholder. In both cases
- * lastStats_.totalTime (seconds) is updated and the exchange is appended to
- * history_, with the audio path noted in the user entry.
- *
- * @throws std::runtime_error (Apple only) if the audio file cannot be opened
- *         or the C API returns no response.
- */
-std::string HybridLiteRTLM::sendMessageWithAudioInternal(
-    const std::string& message,
-    const std::string& audioPath) {
-  std::lock_guard<std::mutex> guard(mutex_);
-  ensureLoaded();
-  const auto began = std::chrono::steady_clock::now();
-  std::string reply;
-#ifdef __APPLE__
-  {
-    // Readability probe only; the stream is closed when this scope ends.
-    std::ifstream probe(audioPath);
-    if (!probe.good()) {
-      throw std::runtime_error("Audio file not found: " + audioPath);
-    }
-  }
-  const std::string requestJson = buildAudioMessageJson(message, audioPath);
-  auto* response = litert_lm_conversation_send_message(
-    conversation_, requestJson.c_str(), nullptr);
-  if (!response) {
-    throw std::runtime_error("LiteRT-LM: sendMessageWithAudio failed");
-  }
-  if (const char* raw = litert_lm_json_response_get_string(response)) {
-    reply = trimWhitespace(extractTextFromResponse(std::string(raw)));
-  }
-  litert_lm_json_response_delete(response);
-#else
-  reply = "[iOS only] Audio inference not available on this platform.";
-#endif
-  const auto finished = std::chrono::steady_clock::now();
-  lastStats_.totalTime = std::chrono::duration<double>(finished - began).count();
-  history_.push_back(Message{Role::USER, message + " [audio: " + audioPath + "]"});
-  history_.push_back(Message{Role::MODEL, reply});
-  return reply;
-}
-// =============================================================================
-// downloadModel — Fetch a model file from a URL
-// =============================================================================
-/// Schedules a model download through Promise<std::string>::async.
-///
-/// On Apple platforms the task forwards to litert_lm::downloadModelFile and
-/// yields its return value (presumably the local file path — confirm against
-/// the helper). On every other platform the task throws std::runtime_error,
-/// because the download is implemented in Kotlin there.
-///
-/// @param url         Source URL of the model.
-/// @param fileName    File name passed through to the download helper.
-/// @param onProgress  Optional progress callback passed through to the helper.
-std::shared_ptr<Promise<std::string>> HybridLiteRTLM::downloadModel(
-    const std::string& url,
-    const std::string& fileName,
-    const std::optional<std::function<void(double)>>& onProgress) {
-  // All arguments are captured by value, so the task owns its own copies.
-  auto job = [url, fileName, onProgress]() -> std::string {
-#ifdef __APPLE__
-    return litert_lm::downloadModelFile(url, fileName, onProgress);
-#else
-    // Not supported from C++ outside Apple platforms (Android uses Kotlin).
-    throw std::runtime_error("Download not available on this platform. Use the Kotlin implementation.");
-#endif
-  };
-  return Promise<std::string>::async(job);
-}
-/// Schedules removal of a previously downloaded model file.
-///
-/// Apple platforms: the file is looked up under
-/// $HOME/Library/Caches/litert_models/; if HOME is unset nothing is removed.
-/// Other platforms: the file is looked up under /tmp/.
-///
-/// NOTE(review): the result of std::remove is ignored, so the promise
-/// completes the same way whether or not a file was actually deleted —
-/// presumably intentional best-effort; confirm.
-std::shared_ptr<Promise<void>> HybridLiteRTLM::deleteModel(const std::string& fileName) {
-  const auto removeFile = [fileName]() {
-#ifdef __APPLE__
-    // Same directory IOSDownloadHelper uses: ~/Library/Caches/litert_models/
-    const char* homeDir = getenv("HOME");
-    if (homeDir == nullptr) {
-      return;  // no HOME, hence no path to delete
-    }
-    const std::string target =
-        std::string(homeDir) + "/Library/Caches/litert_models/" + fileName;
-#else
-    const std::string target = "/tmp/" + fileName;
-#endif
-    std::remove(target.c_str());
-  };
-  return Promise<void>::async(removeFile);
-}
-// =============================================================================
-// getHistory — Copy of the recorded conversation
-// =============================================================================
-/// Returns a copy of history_, taken while mutex_ is held.
-std::vector<Message> HybridLiteRTLM::getHistory() {
-  const std::lock_guard<std::mutex> guard(mutex_);
-  std::vector<Message> snapshot = history_;
-  return snapshot;
-}
-// =============================================================================
-// resetConversation — Drop history/stats and start a fresh conversation
-// =============================================================================
-/// Clears history_ and zeroes lastStats_ under mutex_. On Apple platforms a
-/// new conversation is created as well, but only when a model is loaded and
-/// an engine handle exists.
-void HybridLiteRTLM::resetConversation() {
-  const std::lock_guard<std::mutex> guard(mutex_);
-  lastStats_ = GenerationStats{0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-  history_.clear();
-#ifdef __APPLE__
-  const bool canRestart = isLoaded_ && engine_;
-  if (canRestart) {
-    createNewConversation();
-  }
-#endif
-}
-// =============================================================================
-// isReady — Whether a model is currently loaded
-// =============================================================================
-/// Reads isLoaded_ while holding mutex_ and returns it.
-bool HybridLiteRTLM::isReady() {
-  const std::lock_guard<std::mutex> guard(mutex_);
-  const bool loaded = isLoaded_;
-  return loaded;
-}
-// =============================================================================
-// getStats — Statistics of the most recent generation
-// =============================================================================
-/// Returns a copy of lastStats_, taken while mutex_ is held.
-GenerationStats HybridLiteRTLM::getStats() {
-  const std::lock_guard<std::mutex> guard(mutex_);
-  GenerationStats snapshot = lastStats_;
-  return snapshot;
-}
-// =============================================================================
-// getMemoryUsage — Process memory snapshot (Mach APIs on Apple platforms)
-// =============================================================================
-/// Reports the memory footprint of the current process.
-///
-/// On Apple platforms the resident size comes from task_info() and the
-/// remaining headroom from os_proc_available_memory(). On every other
-/// platform all numeric fields are 0 and isLowMemory is false.
-///
-/// @return MemoryUsage{nativeHeapBytes, residentBytes, availableMemoryBytes,
-///         isLowMemory}. nativeHeapBytes equals residentBytes on Apple
-///         platforms (see below); both stay 0 if task_info() fails.
-MemoryUsage HybridLiteRTLM::getMemoryUsage() {
-  double nativeHeapBytes = 0;
-  double residentBytes = 0;
-  double availableBytes = 0;
-  bool isLowMemory = false;
-#ifdef __APPLE__
-  // Below this much remaining headroom (~200MB) the process counts as low on memory.
-  constexpr double kLowMemoryThresholdBytes = 200.0 * 1024.0 * 1024.0;
-  // Query the resident set size of this process.
-  struct mach_task_basic_info info;
-  mach_msg_type_number_t count = MACH_TASK_BASIC_INFO_COUNT;
-  const kern_return_t kr = task_info(mach_task_self(),
-                                     MACH_TASK_BASIC_INFO,
-                                     reinterpret_cast<task_info_t>(&info),
-                                     &count);
-  if (kr == KERN_SUCCESS) {
-    residentBytes = static_cast<double>(info.resident_size);
-    // mach_task_basic_info doesn't separate heap from RSS, so the current
-    // resident size is reported as the native-heap figure as well.
-    nativeHeapBytes = residentBytes;
-  }
-  // Use os_proc_available_memory() (iOS 13+) for accurate Jetsam headroom.
-  // This reports how much memory the process can still allocate before
-  // the system kills it — far more accurate than total_physical - process_rss.
-  availableBytes = static_cast<double>(os_proc_available_memory());
-  isLowMemory = availableBytes < kLowMemoryThresholdBytes;
-#endif
-  return MemoryUsage{
-    nativeHeapBytes,            // nativeHeapBytes (RSS as proxy on iOS)
-    residentBytes,              // residentBytes
-    availableBytes,             // availableMemoryBytes
-    isLowMemory                 // isLowMemory
-  };
-}
-// =============================================================================
-// close — Release every LiteRT-LM resource and reset state
-// =============================================================================
-/// Marks the model as unloaded, clears history_, deletes the native handles
-/// (Apple platforms only) and zeroes lastStats_.
-///
-/// This function does not take mutex_ itself: it may run from the destructor,
-/// and callers such as loadModel are expected to handle locking.
-/// NOTE(review): if close() is also reachable directly from JS while an
-/// inference is in flight, it would run unsynchronised — confirm the callers.
-void HybridLiteRTLM::close() {
-  isLoaded_ = false;
-  history_.clear();
-#ifdef __APPLE__
-  // Handles are deleted in this order: conversation, its config, the session
-  // config, and the engine last. Each pointer is nulled after deletion.
-  if (conversation_ != nullptr) {
-    litert_lm_conversation_delete(conversation_);
-    conversation_ = nullptr;
-  }
-  if (conv_config_ != nullptr) {
-    litert_lm_conversation_config_delete(conv_config_);
-    conv_config_ = nullptr;
-  }
-  if (session_config_ != nullptr) {
-    litert_lm_session_config_delete(session_config_);
-    session_config_ = nullptr;
-  }
-  if (engine_ != nullptr) {
-    litert_lm_engine_delete(engine_);
-    engine_ = nullptr;
-  }
-#endif
-  lastStats_ = GenerationStats{0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-}
-} // namespace margelo::nitro::litertlm