npm - @novastera-oss/llamarn - Versions diffs - 0.2.2 → 0.2.4 - Mend

@novastera-oss/llamarn 0.2.2 → 0.2.4

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (6)

package/cpp/PureCppImpl.cpp CHANGED Viewed

@@ -43,285 +43,389 @@ double PureCppImpl::multiply(jsi::Runtime& rt, double a, double b) {
 }
 jsi::Value PureCppImpl::loadLlamaModelInfo(jsi::Runtime &runtime, jsi::String modelPath) {
+  // Parse JSI arguments to native types on JSI thread
   std::string path = modelPath.utf8(runtime);
   SystemUtils::normalizeFilePath(path);
-  try {
-    // Initialize llama backend
-    llama_backend_init();
-    // Create model params
-    llama_model_params params = llama_model_default_params();
-    params.n_gpu_layers = 0; // Use CPU for model info loading
-    // Load the model
-    llama_model* model = llama_model_load_from_file(path.c_str(), params);
-    if (!model) {
-      throw std::runtime_error("Failed to load model from file: " + path);
-    }
-    // Create result object
-    jsi::Object result(runtime);
+  if (!jsInvoker_) {
+    // Fallback to synchronous if no CallInvoker available - this should not happen normally
+    throw jsi::JSError(runtime, "CallInvoker not available for async operation");
+  }
-    // Get model parameters
-    result.setProperty(runtime, "n_params", jsi::Value((double)llama_model_n_params(model)));
+  // Create Promise constructor
+  auto Promise = runtime.global().getPropertyAsFunction(runtime, "Promise");
+  auto executor = jsi::Function::createFromHostFunction(
+    runtime,
+    jsi::PropNameID::forAscii(runtime, "executor"),
+    2,
+    [this, path](jsi::Runtime& runtime, const jsi::Value& thisValue, const jsi::Value* args, size_t count) -> jsi::Value {
+      auto resolve = std::make_shared<jsi::Function>(args[0].asObject(runtime).asFunction(runtime));
+      auto reject = std::make_shared<jsi::Function>(args[1].asObject(runtime).asFunction(runtime));
+      // Create shared references to runtime and invoker for thread safety
+      auto runtimePtr = &runtime;
+      auto invoker = jsInvoker_;
+      auto selfPtr = shared_from_this();
+      // Launch background thread for model info loading
+      std::thread([selfPtr, path, resolve, reject, runtimePtr, invoker]() {
+        try {
+          // Initialize llama backend
+          llama_backend_init();
-    // Get vocabulary
-    const llama_vocab* vocab = llama_model_get_vocab(model);
-    result.setProperty(runtime, "n_vocab", jsi::Value((double)llama_vocab_n_tokens(vocab)));
+          // Create model params
+          llama_model_params params = llama_model_default_params();
+          params.n_gpu_layers = 0; // Use CPU for model info loading
-    // Get context size
-    result.setProperty(runtime, "n_context", jsi::Value((double)llama_model_n_ctx_train(model)));
+          // Load the model
+          llama_model* model = llama_model_load_from_file(path.c_str(), params);
-    // Get embedding size
-    result.setProperty(runtime, "n_embd", jsi::Value((double)llama_model_n_embd(model)));
+          if (!model) {
+            throw std::runtime_error("Failed to load model from file: " + path);
+          }
-    // Get model description
-    char buf[512];
-    llama_model_desc(model, buf, sizeof(buf));
-    result.setProperty(runtime, "description",
-                      jsi::String::createFromUtf8(runtime, buf[0] ? buf : "Unknown model"));
+          // Get model information (native types)
+          double n_params = (double)llama_model_n_params(model);
+          const llama_vocab* vocab = llama_model_get_vocab(model);
+          double n_vocab = (double)llama_vocab_n_tokens(vocab);
+          double n_context = (double)llama_model_n_ctx_train(model);
+          double n_embd = (double)llama_model_n_embd(model);
+          // Get model description
+          char buf[512];
+          llama_model_desc(model, buf, sizeof(buf));
+          std::string description = buf[0] ? buf : "Unknown model";
+          // Check if GPU is supported
+          bool gpuSupported = llama_supports_gpu_offload();
+          // Calculate optimal GPU layers if GPU is supported
+          int optimalGpuLayers = 0;
+          if (gpuSupported) {
+            optimalGpuLayers = SystemUtils::getOptimalGpuLayers(model);
+          }
-    // Check if GPU is supported
-    bool gpuSupported = llama_supports_gpu_offload();
-    result.setProperty(runtime, "gpuSupported", jsi::Value(gpuSupported));
+          // Extract quantization type from model description
+          std::string desc(buf);
+          std::string quantType = "Unknown";
+          size_t qPos = desc.find(" Q");
+          if (qPos != std::string::npos && qPos + 5 <= desc.length()) {
+            // Extract quantization string (like Q4_K, Q5_K, Q8_0)
+            quantType = desc.substr(qPos + 1, 4);
+            // Remove any trailing non-alphanumeric characters
+            quantType.erase(std::find_if(quantType.rbegin(), quantType.rend(), [](char c) {
+              return std::isalnum(c);
+            }).base(), quantType.end());
+          }
-    // Calculate optimal GPU layers if GPU is supported
-    int optimalGpuLayers = 0;
-    if (gpuSupported) {
-      optimalGpuLayers = SystemUtils::getOptimalGpuLayers(model);
-    }
-    result.setProperty(runtime, "optimalGpuLayers", jsi::Value(optimalGpuLayers));
-    // Extract quantization type from model description
-    std::string desc(buf);
-    std::string quantType = "Unknown";
-    size_t qPos = desc.find(" Q");
-    if (qPos != std::string::npos && qPos + 5 <= desc.length()) {
-      // Extract quantization string (like Q4_K, Q5_K, Q8_0)
-      quantType = desc.substr(qPos + 1, 4);
-      // Remove any trailing non-alphanumeric characters
-      quantType.erase(std::find_if(quantType.rbegin(), quantType.rend(), [](char c) {
-        return std::isalnum(c);
-      }).base(), quantType.end());
+          // Free the model
+          llama_model_free(model);
+          // Schedule success callback on JS thread to create JSI objects
+          invoker->invokeAsync([selfPtr, resolve, n_params, n_vocab, n_context, n_embd, description, gpuSupported, optimalGpuLayers, quantType, runtimePtr]() {
+            try {
+              // Create result object on JS thread
+              jsi::Object result(*runtimePtr);
+              result.setProperty(*runtimePtr, "n_params", jsi::Value(n_params));
+              result.setProperty(*runtimePtr, "n_vocab", jsi::Value(n_vocab));
+              result.setProperty(*runtimePtr, "n_context", jsi::Value(n_context));
+              result.setProperty(*runtimePtr, "n_embd", jsi::Value(n_embd));
+              result.setProperty(*runtimePtr, "description", jsi::String::createFromUtf8(*runtimePtr, description));
+              result.setProperty(*runtimePtr, "gpuSupported", jsi::Value(gpuSupported));
+              result.setProperty(*runtimePtr, "optimalGpuLayers", jsi::Value(optimalGpuLayers));
+              result.setProperty(*runtimePtr, "quant_type", jsi::String::createFromUtf8(*runtimePtr, quantType));
+              result.setProperty(*runtimePtr, "architecture", jsi::String::createFromUtf8(*runtimePtr, "Unknown"));
+              resolve->call(*runtimePtr, result);
+            } catch (const std::exception& e) {
+              // If conversion fails, create a simple error response
+              jsi::Object errorObj(*runtimePtr);
+              errorObj.setProperty(*runtimePtr, "error", jsi::String::createFromUtf8(*runtimePtr, e.what()));
+              resolve->call(*runtimePtr, errorObj);
+            }
+          });
+        } catch (const std::exception& e) {
+          // Schedule error callback on JS thread
+          std::string errorMsg(e.what());
+          invoker->invokeAsync([reject, errorMsg, runtimePtr]() {
+            try {
+              reject->call(*runtimePtr, jsi::String::createFromUtf8(*runtimePtr, errorMsg));
+            } catch (...) {
+              // Ignore rejection errors
+            }
+          });
+        }
+      }).detach();
+      return jsi::Value::undefined();
     }
-    result.setProperty(runtime, "quant_type", jsi::String::createFromUtf8(runtime, quantType));
-    // Add architecture info
-    result.setProperty(runtime, "architecture",
-                      jsi::String::createFromUtf8(runtime, "Unknown"));
-    // Free the model
-    llama_model_free(model);
-    return result;
-  } catch (const std::exception& e) {
-    jsi::Object error(runtime);
-    error.setProperty(runtime, "message", jsi::String::createFromUtf8(runtime, e.what()));
-    throw jsi::JSError(runtime, error.getProperty(runtime, "message").asString(runtime));
-  }
+  );
+  return Promise.callAsConstructor(runtime, std::move(executor));
 }
 jsi::Value PureCppImpl::initLlama(jsi::Runtime &runtime, jsi::Object options) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  try {
-    // Get model path - required (preserve custom path handling)
-    if (!options.hasProperty(runtime, "model")) {
-      throw std::runtime_error("model path is required");
-    }
-    // Initialize llama backend
-    llama_backend_init();
-    std::string model_path = options.getProperty(runtime, "model").asString(runtime).utf8(runtime);
-    SystemUtils::normalizeFilePath(model_path);
-    // Initialize params with defaults
-    rn_common_params params;
-    // Set default sampling parameters
-    params.sampling = common_params_sampling();
-    // Set model path
-    params.model.path = model_path;
-    // Override defaults with user settings if provided
-    SystemUtils::setIfExists(runtime, options, "n_ctx", params.n_ctx);
-    SystemUtils::setIfExists(runtime, options, "n_batch", params.n_batch);
-    SystemUtils::setIfExists(runtime, options, "n_ubatch", params.n_ubatch);
-    SystemUtils::setIfExists(runtime, options, "n_keep", params.n_keep);
-    // Memory and resource options - MUST respect user settings
-    SystemUtils::setIfExists(runtime, options, "use_mmap", params.use_mmap);
-    SystemUtils::setIfExists(runtime, options, "use_mlock", params.use_mlock);
-    SystemUtils::setIfExists(runtime, options, "use_jinja", params.use_jinja);
-    // Extract threading parameters (preserve custom thread logic)
-    int n_threads = 0; // 0 = auto
-    if (options.hasProperty(runtime, "n_threads")) {
-      n_threads = options.getProperty(runtime, "n_threads").asNumber();
-    } else {
-      n_threads = SystemUtils::getOptimalThreadCount();
-    }
-    params.cpuparams.n_threads = n_threads;
-    // Set n_gpu_layers (preserve custom GPU logic)
-    int n_gpu_layers = 0;
-    bool gpuSupported = llama_supports_gpu_offload();
-    if (options.hasProperty(runtime, "n_gpu_layers") && gpuSupported) {
-      n_gpu_layers = options.getProperty(runtime, "n_gpu_layers").asNumber();
-    }
-    params.n_gpu_layers = n_gpu_layers;
-    // Additional model parameters
-    SystemUtils::setIfExists(runtime, options, "logits_file", params.logits_file);
-    SystemUtils::setIfExists(runtime, options, "embedding", params.embedding);
-    SystemUtils::setIfExists(runtime, options, "rope_freq_base", params.rope_freq_base);
-    SystemUtils::setIfExists(runtime, options, "rope_freq_scale", params.rope_freq_scale);
-    // Sampling parameters
-    SystemUtils::setIfExists(runtime, options, "seed", params.sampling.seed);
+  // Parse JSI arguments to native types on JSI thread
+  if (!options.hasProperty(runtime, "model")) {
+    throw jsi::JSError(runtime, "model path is required");
+  }
-    // Other system parameters
-    SystemUtils::setIfExists(runtime, options, "verbose", params.verbosity);
+  if (!jsInvoker_) {
+    // Fallback to synchronous if no CallInvoker available - this should not happen normally
+    throw jsi::JSError(runtime, "CallInvoker not available for async operation");
+  }
-    // RoPE settings if provided
-    if (options.hasProperty(runtime, "yarn_ext_factor")) {
-      params.yarn_ext_factor = options.getProperty(runtime, "yarn_ext_factor").asNumber();
-    }
-    if (options.hasProperty(runtime, "yarn_attn_factor")) {
-      params.yarn_attn_factor = options.getProperty(runtime, "yarn_attn_factor").asNumber();
-    }
-    if (options.hasProperty(runtime, "yarn_beta_fast")) {
-      params.yarn_beta_fast = options.getProperty(runtime, "yarn_beta_fast").asNumber();
-    }
-    if (options.hasProperty(runtime, "yarn_beta_slow")) {
-      params.yarn_beta_slow = options.getProperty(runtime, "yarn_beta_slow").asNumber();
-    }
+  // Parse all options to native types on JSI thread
+  std::string model_path = options.getProperty(runtime, "model").asString(runtime).utf8(runtime);
+  SystemUtils::normalizeFilePath(model_path);
+  // Parse all numeric/boolean options to native types
+  int n_ctx = 2048;  // defaults
+  int n_batch = 512;
+  int n_ubatch = 512;
+  int n_keep = 0;
+  bool use_mmap = true;
+  bool use_mlock = false;
+  bool use_jinja = false;
+  bool embedding = false;
+  int n_threads = 0;
+  int n_gpu_layers = 0;
+  std::string logits_file;
+  float rope_freq_base = 10000.0f;
+  float rope_freq_scale = 1.0f;
+  uint32_t seed = 4294967295U; // default seed
+  int verbosity = 0;
+  float yarn_ext_factor = 1.0f;
+  float yarn_attn_factor = 1.0f;
+  float yarn_beta_fast = 32.0f;
+  float yarn_beta_slow = 1.0f;
+  std::string chat_template;
+  // Parse options to native types
+  SystemUtils::setIfExists(runtime, options, "n_ctx", n_ctx);
+  SystemUtils::setIfExists(runtime, options, "n_batch", n_batch);
+  SystemUtils::setIfExists(runtime, options, "n_ubatch", n_ubatch);
+  SystemUtils::setIfExists(runtime, options, "n_keep", n_keep);
+  SystemUtils::setIfExists(runtime, options, "use_mmap", use_mmap);
+  SystemUtils::setIfExists(runtime, options, "use_mlock", use_mlock);
+  SystemUtils::setIfExists(runtime, options, "use_jinja", use_jinja);
+  SystemUtils::setIfExists(runtime, options, "embedding", embedding);
+  SystemUtils::setIfExists(runtime, options, "rope_freq_base", rope_freq_base);
+  SystemUtils::setIfExists(runtime, options, "rope_freq_scale", rope_freq_scale);
+  SystemUtils::setIfExists(runtime, options, "seed", seed);
+  SystemUtils::setIfExists(runtime, options, "verbose", verbosity);
+  SystemUtils::setIfExists(runtime, options, "logits_file", logits_file);
+  SystemUtils::setIfExists(runtime, options, "chat_template", chat_template);
+  if (options.hasProperty(runtime, "n_threads")) {
+    n_threads = options.getProperty(runtime, "n_threads").asNumber();
+  } else {
+    n_threads = SystemUtils::getOptimalThreadCount();
+  }
-    // Support for chat template override
-    std::string chat_template;
-    if (SystemUtils::setIfExists(runtime, options, "chat_template", chat_template)) {
-      params.chat_template = chat_template;
-    }
+  bool gpuSupported = llama_supports_gpu_offload();
+  if (options.hasProperty(runtime, "n_gpu_layers") && gpuSupported) {
+    n_gpu_layers = options.getProperty(runtime, "n_gpu_layers").asNumber();
+  }
-    // Support for LoRA adapters
-    if (options.hasProperty(runtime, "lora_adapters") && options.getProperty(runtime, "lora_adapters").isObject()) {
-      jsi::Object lora_obj = options.getProperty(runtime, "lora_adapters").asObject(runtime);
-      if (lora_obj.isArray(runtime)) {
-        jsi::Array lora_array = lora_obj.asArray(runtime);
-        size_t n_lora = lora_array.size(runtime);
-        for (size_t i = 0; i < n_lora; i++) {
-          if (lora_array.getValueAtIndex(runtime, i).isObject()) {
-            jsi::Object adapter = lora_array.getValueAtIndex(runtime, i).asObject(runtime);
-            if (adapter.hasProperty(runtime, "path") && adapter.getProperty(runtime, "path").isString()) {
-              common_adapter_lora_info lora;
-              lora.path = adapter.getProperty(runtime, "path").asString(runtime).utf8(runtime);
-              // Get scale if provided
-              lora.scale = 1.0f; // Default scale
-              if (adapter.hasProperty(runtime, "scale") && adapter.getProperty(runtime, "scale").isNumber()) {
-                lora.scale = adapter.getProperty(runtime, "scale").asNumber();
-              }
+  if (options.hasProperty(runtime, "yarn_ext_factor")) {
+    yarn_ext_factor = options.getProperty(runtime, "yarn_ext_factor").asNumber();
+  }
+  if (options.hasProperty(runtime, "yarn_attn_factor")) {
+    yarn_attn_factor = options.getProperty(runtime, "yarn_attn_factor").asNumber();
+  }
+  if (options.hasProperty(runtime, "yarn_beta_fast")) {
+    yarn_beta_fast = options.getProperty(runtime, "yarn_beta_fast").asNumber();
+  }
+  if (options.hasProperty(runtime, "yarn_beta_slow")) {
+    yarn_beta_slow = options.getProperty(runtime, "yarn_beta_slow").asNumber();
+  }
-              params.lora_adapters.push_back(lora);
+  // Parse LoRA adapters to native structure
+  std::vector<std::pair<std::string, float>> lora_adapters;
+  if (options.hasProperty(runtime, "lora_adapters") && options.getProperty(runtime, "lora_adapters").isObject()) {
+    jsi::Object lora_obj = options.getProperty(runtime, "lora_adapters").asObject(runtime);
+    if (lora_obj.isArray(runtime)) {
+      jsi::Array lora_array = lora_obj.asArray(runtime);
+      size_t n_lora = lora_array.size(runtime);
+      for (size_t i = 0; i < n_lora; i++) {
+        if (lora_array.getValueAtIndex(runtime, i).isObject()) {
+          jsi::Object adapter = lora_array.getValueAtIndex(runtime, i).asObject(runtime);
+          if (adapter.hasProperty(runtime, "path") && adapter.getProperty(runtime, "path").isString()) {
+            std::string lora_path = adapter.getProperty(runtime, "path").asString(runtime).utf8(runtime);
+            float lora_scale = 1.0f; // Default scale
+            if (adapter.hasProperty(runtime, "scale") && adapter.getProperty(runtime, "scale").isNumber()) {
+              lora_scale = adapter.getProperty(runtime, "scale").asNumber();
             }
+            lora_adapters.emplace_back(lora_path, lora_scale);
           }
         }
       }
     }
+  }
-    // Initialize using common_init_from_params
-    common_init_result result;
-    try {
-      result = common_init_from_params(params);
+  // Create Promise constructor
+  auto Promise = runtime.global().getPropertyAsFunction(runtime, "Promise");
+  auto executor = jsi::Function::createFromHostFunction(
+    runtime,
+    jsi::PropNameID::forAscii(runtime, "executor"),
+    2,
+    [this, model_path, n_ctx, n_batch, n_ubatch, n_keep, use_mmap, use_mlock, use_jinja, embedding, n_threads, n_gpu_layers, logits_file, rope_freq_base, rope_freq_scale, seed, verbosity, yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow, chat_template, lora_adapters](jsi::Runtime& runtime, const jsi::Value& thisValue, const jsi::Value* args, size_t count) -> jsi::Value {
-      // Check if initialization was successful
-      if (!result.model || !result.context) {
-        throw std::runtime_error("Failed to initialize model and context");
-      }
-    } catch (const std::exception& e) {
-      // If we were trying to use GPU and got a Vulkan/shader error, retry with CPU-only
-      if (params.n_gpu_layers > 0) {
-        // Other GPU error, still try CPU fallback
-        fprintf(stderr, "GPU initialization failed (%s), retrying with CPU-only\n", e.what());
-        params.n_gpu_layers = 0;
+      auto resolve = std::make_shared<jsi::Function>(args[0].asObject(runtime).asFunction(runtime));
+      auto reject = std::make_shared<jsi::Function>(args[1].asObject(runtime).asFunction(runtime));
+      // Create shared references to runtime and invoker for thread safety
+      auto runtimePtr = &runtime;
+      auto invoker = jsInvoker_;
+      auto selfPtr = shared_from_this();
+      // Launch background thread for model initialization
+      std::thread([selfPtr, model_path, n_ctx, n_batch, n_ubatch, n_keep, use_mmap, use_mlock, use_jinja, embedding, n_threads, n_gpu_layers, logits_file, rope_freq_base, rope_freq_scale, seed, verbosity, yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow, chat_template, lora_adapters, resolve, reject, runtimePtr, invoker]() {
         try {
-          result = common_init_from_params(params);
-          if (!result.model || !result.context) {
-            throw std::runtime_error("Failed to initialize model and context even with CPU-only mode");
+          // Thread-safe access to member variables
+          std::lock_guard<std::mutex> lock(selfPtr->mutex_);
+          // Initialize llama backend
+          llama_backend_init();
+          // Initialize params with defaults
+          rn_common_params params;
+          // Set default sampling parameters
+          params.sampling = common_params_sampling();
+          // Set all parsed native values
+          params.model.path = model_path;
+          params.n_ctx = n_ctx;
+          params.n_batch = n_batch;
+          params.n_ubatch = n_ubatch;
+          params.n_keep = n_keep;
+          params.use_mmap = use_mmap;
+          params.use_mlock = use_mlock;
+          params.use_jinja = use_jinja;
+          params.embedding = embedding;
+          params.cpuparams.n_threads = n_threads;
+          params.n_gpu_layers = n_gpu_layers;
+          params.logits_file = logits_file;
+          params.rope_freq_base = rope_freq_base;
+          params.rope_freq_scale = rope_freq_scale;
+          params.sampling.seed = seed;
+          params.verbosity = verbosity;
+          params.yarn_ext_factor = yarn_ext_factor;
+          params.yarn_attn_factor = yarn_attn_factor;
+          params.yarn_beta_fast = yarn_beta_fast;
+          params.yarn_beta_slow = yarn_beta_slow;
+          if (!chat_template.empty()) {
+            params.chat_template = chat_template;
           }
+          // Add LoRA adapters
+          for (const auto& lora : lora_adapters) {
+            common_adapter_lora_info lora_info;
+            lora_info.path = lora.first;
+            lora_info.scale = lora.second;
+            params.lora_adapters.push_back(lora_info);
+          }
+          // Initialize using common_init_from_params
+          common_init_result result;
-          fprintf(stderr, "Successfully recovered with CPU-only mode after GPU failure\n");
-        } catch (const std::exception& cpu_e) {
-          throw std::runtime_error(std::string("Model initialization failed: ") + cpu_e.what());
-        }
-      } else {
-        // Was already CPU-only, re-throw the original error
-        throw std::runtime_error(std::string("Model initialization failed: ") + e.what());
-      }
-    }
+          try {
+            result = common_init_from_params(params);
+            // Check if initialization was successful
+            if (!result.model || !result.context) {
+              throw std::runtime_error("Failed to initialize model and context");
+            }
+          } catch (const std::exception& e) {
+            // If we were trying to use GPU and got an error, retry with CPU-only
+            if (params.n_gpu_layers > 0) {
+              fprintf(stderr, "GPU initialization failed (%s), retrying with CPU-only\n", e.what());
+              params.n_gpu_layers = 0;
+              try {
+                result = common_init_from_params(params);
+                if (!result.model || !result.context) {
+                  throw std::runtime_error("Failed to initialize model and context even with CPU-only mode");
+                }
+                fprintf(stderr, "Successfully recovered with CPU-only mode after GPU failure\n");
+              } catch (const std::exception& cpu_e) {
+                throw std::runtime_error(std::string("Model initialization failed: ") + cpu_e.what());
+              }
+            } else {
+              // Was already CPU-only, re-throw the original error
+              throw std::runtime_error(std::string("Model initialization failed: ") + e.what());
+            }
+          }
-    // Create and initialize rn_llama_context
-    rn_ctx_ = std::make_unique<facebook::react::rn_llama_context>();
-    rn_ctx_->model = result.model.release();
-    rn_ctx_->ctx = result.context.release();
-    rn_ctx_->model_loaded = true;
-    rn_ctx_->vocab = llama_model_get_vocab(rn_ctx_->model);
-    // Create a rn_common_params from the common_params
-    rn_common_params rn_params;
-    // Copy the base class fields
-    static_cast<common_params&>(rn_params) = params;
-    // Set additional fields
-    rn_params.use_jinja = params.use_jinja;
-    rn_params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
-    // Use chatml format by default instead of content-only for better tool support
-    rn_params.chat_format = COMMON_CHAT_FORMAT_GENERIC;
-    // Now assign to the context
-    rn_ctx_->params = rn_params;
-    // Initialize chat templates with proper error handling
-    try {
-        // Get BOS and EOS tokens if provided in options
-        std::string bos_token_override;
-        std::string eos_token_override;
-        SystemUtils::setIfExists(runtime, options, "bos_token", bos_token_override);
-        SystemUtils::setIfExists(runtime, options, "eos_token", eos_token_override);
-        rn_ctx_->chat_templates = common_chat_templates_init(
-            rn_ctx_->model,
-            params.chat_template,
-            bos_token_override,
-            eos_token_override
-        );
-        if (!rn_ctx_->chat_templates) {
-            throw std::runtime_error("Failed to initialize chat templates");
-        }
-    } catch (const std::exception& e) {
-        // Log warning and fallback to chatml
-        fprintf(stderr, "Warning: Failed to initialize chat template: %s. Falling back to chatml.\n", e.what());
-        rn_ctx_->chat_templates = common_chat_templates_init(rn_ctx_->model, "chatml");
-        if (!rn_ctx_->chat_templates) {
-            throw std::runtime_error("Failed to initialize fallback chatml template");
+          // Create and initialize rn_llama_context
+          selfPtr->rn_ctx_ = std::make_unique<facebook::react::rn_llama_context>();
+          selfPtr->rn_ctx_->model = result.model.release();
+          selfPtr->rn_ctx_->ctx = result.context.release();
+          selfPtr->rn_ctx_->model_loaded = true;
+          selfPtr->rn_ctx_->vocab = llama_model_get_vocab(selfPtr->rn_ctx_->model);
+          // Create a rn_common_params from the common_params
+          rn_common_params rn_params;
+          // Copy the base class fields
+          static_cast<common_params&>(rn_params) = params;
+          // Set additional fields
+          rn_params.use_jinja = params.use_jinja;
+          rn_params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
+          // Now assign to the context
+          selfPtr->rn_ctx_->params = rn_params;
+          selfPtr->rn_ctx_->chat_templates = common_chat_templates_init(selfPtr->rn_ctx_->model, params.chat_template);
+          try {
+              common_chat_format_example(selfPtr->rn_ctx_->chat_templates.get(), params.use_jinja);
+          } catch (const std::exception & e) {
+              // Fallback to chatml if the original template parsing fails
+              selfPtr->rn_ctx_->chat_templates = common_chat_templates_init(selfPtr->rn_ctx_->model, "chatml");
+          }
+          // Schedule success callback on JS thread to create JSI objects
+          invoker->invokeAsync([selfPtr, resolve, runtimePtr]() {
+            try {
+              // Create the model object and resolve Promise on JS thread
+              jsi::Object modelObject = selfPtr->createModelObject(*runtimePtr, selfPtr->rn_ctx_.get());
+              resolve->call(*runtimePtr, modelObject);
+            } catch (const std::exception& e) {
+              // If conversion fails, create a simple error response
+              jsi::Object errorObj(*runtimePtr);
+              errorObj.setProperty(*runtimePtr, "error", jsi::String::createFromUtf8(*runtimePtr, e.what()));
+              resolve->call(*runtimePtr, errorObj);
+            }
+          });
+        } catch (const std::exception& e) {
+          // Schedule error callback on JS thread
+          std::string errorMsg(e.what());
+          fprintf(stderr, "initLlama error: %s\n", errorMsg.c_str());
+          invoker->invokeAsync([reject, errorMsg, runtimePtr]() {
+            try {
+              reject->call(*runtimePtr, jsi::String::createFromUtf8(*runtimePtr, errorMsg));
+            } catch (...) {
+              // Ignore rejection errors
+            }
+          });
         }
+      }).detach();
+      return jsi::Value::undefined();
     }
-    // Create the model object and return it
-    return createModelObject(runtime, rn_ctx_.get());
-  } catch (const std::exception& e) {
-    fprintf(stderr, "initLlama error: %s\n", e.what());
-    throw jsi::JSError(runtime, e.what());
-  }
+  );
+  return Promise.callAsConstructor(runtime, std::move(executor));
 }
 jsi::Object PureCppImpl::createModelObject(jsi::Runtime& runtime, rn_llama_context* rn_ctx) {

package/cpp/PureCppImpl.h CHANGED Viewed

@@ -28,7 +28,7 @@ class LlamaCppModel;     // Forward declare LlamaCppModel
 namespace facebook::react {
 // Note: The class name is PureCppImpl, and it derives from your project's C++ spec
-class PureCppImpl : public NativeRNLlamaCppCxxSpec<PureCppImpl> {
+class PureCppImpl : public NativeRNLlamaCppCxxSpec<PureCppImpl>, public std::enable_shared_from_this<PureCppImpl> {
 public:
     // Constructor
     PureCppImpl(std::shared_ptr<CallInvoker> jsInvoker);

package/cpp/SystemUtils.h CHANGED Viewed

@@ -44,8 +44,8 @@ public:
    * Helper functions to easily set values from a JSI object if the property exists.
    * Returns true if the property was found and the value was set.
    */
-  // Template for all numeric types
-  template<typename T, typename = typename std::enable_if<std::is_arithmetic<T>::value>::type>
+  // Template for numeric types (excluding bool so bool specialization is used)
+  template<typename T, typename = typename std::enable_if<std::is_arithmetic<T>::value && !std::is_same<T, bool>::value>::type>
   static bool setIfExists(jsi::Runtime& rt, const jsi::Object& options, const std::string& key, T& outValue) {
     if (options.hasProperty(rt, key.c_str())) {
       jsi::Value val = options.getProperty(rt, key.c_str());

package/cpp/rn-completion.cpp CHANGED Viewed

@@ -350,7 +350,7 @@ CompletionResult run_chat_completion(
         common_chat_templates_inputs template_inputs;
         template_inputs.messages = chat_msgs;
         template_inputs.add_generation_prompt = true;
-        template_inputs.use_jinja = options.use_jinja;
+        template_inputs.use_jinja = rn_ctx->params.use_jinja;
         // Note: extract_reasoning field doesn't exist in current llama.cpp version
         // template_inputs.extract_reasoning = true; // Default to true to extract reasoning content if available
@@ -391,6 +391,31 @@ CompletionResult run_chat_completion(
         result = run_completion(rn_ctx, cmpl_options, callback);
         if (result.success) {
+            // Parse the generated content for tool calls and structured responses
+            common_chat_msg parsed_msg;
+            bool has_parsed_content = false;
+            // Only parse if we have tools available and the response isn't empty
+            if (!template_inputs.tools.empty() && !result.content.empty()) {
+                try {
+                    // Construct the chat syntax for parsing using the format from template application
+                    common_chat_syntax syntax;
+                    syntax.format = chat_params.format;  // Use format from template, not from params
+                    syntax.reasoning_format = rn_ctx->params.reasoning_format;
+                    syntax.reasoning_in_content = true;
+                    syntax.thinking_forced_open = false;
+                    syntax.parse_tool_calls = true;
+                    // Parse the generated content for tool calls
+                    parsed_msg = common_chat_parse(result.content, false, syntax);
+                    has_parsed_content = true;
+                } catch (const std::exception& e) {
+                    // If parsing fails, treat as regular content
+                    has_parsed_content = false;
+                }
+            }
             // Create OpenAI-compatible response
             json response = {
                 {"id", gen_chatcmplid()},
@@ -403,11 +428,39 @@ CompletionResult run_chat_completion(
             json choice = {
                 {"index", 0},
                 {"message", {
-                    {"role", "assistant"},
-                    {"content", result.content}
+                    {"role", "assistant"}
                 }},
                 {"finish_reason", "stop"}
             };
+            // Add parsed content and tool calls if available
+            if (has_parsed_content && !parsed_msg.tool_calls.empty()) {
+                // Set content to the parsed content (may be null for tool-only responses)
+                if (!parsed_msg.content.empty()) {
+                    choice["message"]["content"] = parsed_msg.content;
+                } else {
+                    choice["message"]["content"] = nullptr;
+                }
+                // Add tool calls to the message
+                json tool_calls = json::array();
+                for (const auto& tool_call : parsed_msg.tool_calls) {
+                    json tc = {
+                        {"id", tool_call.id.empty() ? ("call_" + std::to_string(std::rand())) : tool_call.id},
+                        {"type", "function"},
+                        {"function", {
+                            {"name", tool_call.name},
+                            {"arguments", tool_call.arguments}
+                        }}
+                    };
+                    tool_calls.push_back(tc);
+                }
+                choice["message"]["tool_calls"] = tool_calls;
+                choice["finish_reason"] = "tool_calls";
+            } else {
+                // Regular text response
+                choice["message"]["content"] = has_parsed_content ? parsed_msg.content : result.content;
+            }
             choices.push_back(choice);
             response["choices"] = choices;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@novastera-oss/llamarn",
-  "version": "0.2.2",
+  "version": "0.2.4",
   "description": "An attempt at a pure cpp turbo module library",
   "source": "./src/index.tsx",
   "main": "./lib/module/index.js",

package/src/NativeRNLlamaCpp.ts CHANGED Viewed

@@ -234,7 +234,7 @@ export interface Spec extends TurboModule {
   // Initialize a Llama context with the given model parameters
   initLlama(params: LlamaModelParams): Promise<LlamaContextType & LlamaContextMethods>;
-  // Load model info without creating a full contex
+  // Load model info without creating a full context
   loadLlamaModelInfo(modelPath: string): Promise<{
     n_params: number;
     n_vocab: number;