npm - @novastera-oss/llamarn - Versions diffs - 0.2.3 → 0.2.5 - Mend

@novastera-oss/llamarn 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/cpp/PureCppImpl.cpp CHANGED

@@ -43,267 +43,389 @@ double PureCppImpl::multiply(jsi::Runtime& rt, double a, double b) {
 }
 jsi::Value PureCppImpl::loadLlamaModelInfo(jsi::Runtime &runtime, jsi::String modelPath) {
+  // Parse JSI arguments to native types on JSI thread
   std::string path = modelPath.utf8(runtime);
   SystemUtils::normalizeFilePath(path);
-  try {
-    // Initialize llama backend
-    llama_backend_init();
-    // Create model params
-    llama_model_params params = llama_model_default_params();
-    params.n_gpu_layers = 0; // Use CPU for model info loading
-    // Load the model
-    llama_model* model = llama_model_load_from_file(path.c_str(), params);
-    if (!model) {
-      throw std::runtime_error("Failed to load model from file: " + path);
-    }
-    // Create result object
-    jsi::Object result(runtime);
+  if (!jsInvoker_) {
+    // Fallback to synchronous if no CallInvoker available - this should not happen normally
+    throw jsi::JSError(runtime, "CallInvoker not available for async operation");
+  }
-    // Get model parameters
-    result.setProperty(runtime, "n_params", jsi::Value((double)llama_model_n_params(model)));
+  // Create Promise constructor
+  auto Promise = runtime.global().getPropertyAsFunction(runtime, "Promise");
+  auto executor = jsi::Function::createFromHostFunction(
+    runtime,
+    jsi::PropNameID::forAscii(runtime, "executor"),
+    2,
+    [this, path](jsi::Runtime& runtime, const jsi::Value& thisValue, const jsi::Value* args, size_t count) -> jsi::Value {
+      auto resolve = std::make_shared<jsi::Function>(args[0].asObject(runtime).asFunction(runtime));
+      auto reject = std::make_shared<jsi::Function>(args[1].asObject(runtime).asFunction(runtime));
+      // Create shared references to runtime and invoker for thread safety
+      auto runtimePtr = &runtime;
+      auto invoker = jsInvoker_;
+      auto selfPtr = shared_from_this();
+      // Launch background thread for model info loading
+      std::thread([selfPtr, path, resolve, reject, runtimePtr, invoker]() {
+        try {
+          // Initialize llama backend
+          llama_backend_init();
-    // Get vocabulary
-    const llama_vocab* vocab = llama_model_get_vocab(model);
-    result.setProperty(runtime, "n_vocab", jsi::Value((double)llama_vocab_n_tokens(vocab)));
+          // Create model params
+          llama_model_params params = llama_model_default_params();
+          params.n_gpu_layers = 0; // Use CPU for model info loading
-    // Get context size
-    result.setProperty(runtime, "n_context", jsi::Value((double)llama_model_n_ctx_train(model)));
+          // Load the model
+          llama_model* model = llama_model_load_from_file(path.c_str(), params);
-    // Get embedding size
-    result.setProperty(runtime, "n_embd", jsi::Value((double)llama_model_n_embd(model)));
+          if (!model) {
+            throw std::runtime_error("Failed to load model from file: " + path);
+          }
-    // Get model description
-    char buf[512];
-    llama_model_desc(model, buf, sizeof(buf));
-    result.setProperty(runtime, "description",
-                      jsi::String::createFromUtf8(runtime, buf[0] ? buf : "Unknown model"));
+          // Get model information (native types)
+          double n_params = (double)llama_model_n_params(model);
+          const llama_vocab* vocab = llama_model_get_vocab(model);
+          double n_vocab = (double)llama_vocab_n_tokens(vocab);
+          double n_context = (double)llama_model_n_ctx_train(model);
+          double n_embd = (double)llama_model_n_embd(model);
+          // Get model description
+          char buf[512];
+          llama_model_desc(model, buf, sizeof(buf));
+          std::string description = buf[0] ? buf : "Unknown model";
+          // Check if GPU is supported
+          bool gpuSupported = llama_supports_gpu_offload();
+          // Calculate optimal GPU layers if GPU is supported
+          int optimalGpuLayers = 0;
+          if (gpuSupported) {
+            optimalGpuLayers = SystemUtils::getOptimalGpuLayers(model);
+          }
-    // Check if GPU is supported
-    bool gpuSupported = llama_supports_gpu_offload();
-    result.setProperty(runtime, "gpuSupported", jsi::Value(gpuSupported));
+          // Extract quantization type from model description
+          std::string desc(buf);
+          std::string quantType = "Unknown";
+          size_t qPos = desc.find(" Q");
+          if (qPos != std::string::npos && qPos + 5 <= desc.length()) {
+            // Extract quantization string (like Q4_K, Q5_K, Q8_0)
+            quantType = desc.substr(qPos + 1, 4);
+            // Remove any trailing non-alphanumeric characters
+            quantType.erase(std::find_if(quantType.rbegin(), quantType.rend(), [](char c) {
+              return std::isalnum(c);
+            }).base(), quantType.end());
+          }
-    // Calculate optimal GPU layers if GPU is supported
-    int optimalGpuLayers = 0;
-    if (gpuSupported) {
-      optimalGpuLayers = SystemUtils::getOptimalGpuLayers(model);
-    }
-    result.setProperty(runtime, "optimalGpuLayers", jsi::Value(optimalGpuLayers));
-    // Extract quantization type from model description
-    std::string desc(buf);
-    std::string quantType = "Unknown";
-    size_t qPos = desc.find(" Q");
-    if (qPos != std::string::npos && qPos + 5 <= desc.length()) {
-      // Extract quantization string (like Q4_K, Q5_K, Q8_0)
-      quantType = desc.substr(qPos + 1, 4);
-      // Remove any trailing non-alphanumeric characters
-      quantType.erase(std::find_if(quantType.rbegin(), quantType.rend(), [](char c) {
-        return std::isalnum(c);
-      }).base(), quantType.end());
+          // Free the model
+          llama_model_free(model);
+          // Schedule success callback on JS thread to create JSI objects
+          invoker->invokeAsync([selfPtr, resolve, n_params, n_vocab, n_context, n_embd, description, gpuSupported, optimalGpuLayers, quantType, runtimePtr]() {
+            try {
+              // Create result object on JS thread
+              jsi::Object result(*runtimePtr);
+              result.setProperty(*runtimePtr, "n_params", jsi::Value(n_params));
+              result.setProperty(*runtimePtr, "n_vocab", jsi::Value(n_vocab));
+              result.setProperty(*runtimePtr, "n_context", jsi::Value(n_context));
+              result.setProperty(*runtimePtr, "n_embd", jsi::Value(n_embd));
+              result.setProperty(*runtimePtr, "description", jsi::String::createFromUtf8(*runtimePtr, description));
+              result.setProperty(*runtimePtr, "gpuSupported", jsi::Value(gpuSupported));
+              result.setProperty(*runtimePtr, "optimalGpuLayers", jsi::Value(optimalGpuLayers));
+              result.setProperty(*runtimePtr, "quant_type", jsi::String::createFromUtf8(*runtimePtr, quantType));
+              result.setProperty(*runtimePtr, "architecture", jsi::String::createFromUtf8(*runtimePtr, "Unknown"));
+              resolve->call(*runtimePtr, result);
+            } catch (const std::exception& e) {
+              // If conversion fails, create a simple error response
+              jsi::Object errorObj(*runtimePtr);
+              errorObj.setProperty(*runtimePtr, "error", jsi::String::createFromUtf8(*runtimePtr, e.what()));
+              resolve->call(*runtimePtr, errorObj);
+            }
+          });
+        } catch (const std::exception& e) {
+          // Schedule error callback on JS thread
+          std::string errorMsg(e.what());
+          invoker->invokeAsync([reject, errorMsg, runtimePtr]() {
+            try {
+              reject->call(*runtimePtr, jsi::String::createFromUtf8(*runtimePtr, errorMsg));
+            } catch (...) {
+              // Ignore rejection errors
+            }
+          });
+        }
+      }).detach();
+      return jsi::Value::undefined();
     }
-    result.setProperty(runtime, "quant_type", jsi::String::createFromUtf8(runtime, quantType));
-    // Add architecture info
-    result.setProperty(runtime, "architecture",
-                      jsi::String::createFromUtf8(runtime, "Unknown"));
-    // Free the model
-    llama_model_free(model);
-    return result;
-  } catch (const std::exception& e) {
-    jsi::Object error(runtime);
-    error.setProperty(runtime, "message", jsi::String::createFromUtf8(runtime, e.what()));
-    throw jsi::JSError(runtime, error.getProperty(runtime, "message").asString(runtime));
-  }
+  );
+  return Promise.callAsConstructor(runtime, std::move(executor));
 }
 jsi::Value PureCppImpl::initLlama(jsi::Runtime &runtime, jsi::Object options) {
-  std::lock_guard<std::mutex> lock(mutex_);
-  try {
-    // Get model path - required (preserve custom path handling)
-    if (!options.hasProperty(runtime, "model")) {
-      throw std::runtime_error("model path is required");
-    }
-    // Initialize llama backend
-    llama_backend_init();
-    std::string model_path = options.getProperty(runtime, "model").asString(runtime).utf8(runtime);
-    SystemUtils::normalizeFilePath(model_path);
-    // Initialize params with defaults
-    rn_common_params params;
-    // Set default sampling parameters
-    params.sampling = common_params_sampling();
-    // Set model path
-    params.model.path = model_path;
-    // Override defaults with user settings if provided
-    SystemUtils::setIfExists(runtime, options, "n_ctx", params.n_ctx);
-    SystemUtils::setIfExists(runtime, options, "n_batch", params.n_batch);
-    SystemUtils::setIfExists(runtime, options, "n_ubatch", params.n_ubatch);
-    SystemUtils::setIfExists(runtime, options, "n_keep", params.n_keep);
-    // Memory and resource options - MUST respect user settings
-    SystemUtils::setIfExists(runtime, options, "use_mmap", params.use_mmap);
-    SystemUtils::setIfExists(runtime, options, "use_mlock", params.use_mlock);
-    SystemUtils::setIfExists(runtime, options, "use_jinja", params.use_jinja);
-    // Extract threading parameters (preserve custom thread logic)
-    int n_threads = 0; // 0 = auto
-    if (options.hasProperty(runtime, "n_threads")) {
-      n_threads = options.getProperty(runtime, "n_threads").asNumber();
-    } else {
-      n_threads = SystemUtils::getOptimalThreadCount();
-    }
-    params.cpuparams.n_threads = n_threads;
-    // Set n_gpu_layers (preserve custom GPU logic)
-    int n_gpu_layers = 0;
-    bool gpuSupported = llama_supports_gpu_offload();
-    if (options.hasProperty(runtime, "n_gpu_layers") && gpuSupported) {
-      n_gpu_layers = options.getProperty(runtime, "n_gpu_layers").asNumber();
-    }
-    params.n_gpu_layers = n_gpu_layers;
-    // Additional model parameters
-    SystemUtils::setIfExists(runtime, options, "logits_file", params.logits_file);
-    SystemUtils::setIfExists(runtime, options, "embedding", params.embedding);
-    SystemUtils::setIfExists(runtime, options, "rope_freq_base", params.rope_freq_base);
-    SystemUtils::setIfExists(runtime, options, "rope_freq_scale", params.rope_freq_scale);
-    // Sampling parameters
-    SystemUtils::setIfExists(runtime, options, "seed", params.sampling.seed);
+  // Parse JSI arguments to native types on JSI thread
+  if (!options.hasProperty(runtime, "model")) {
+    throw jsi::JSError(runtime, "model path is required");
+  }
-    // Other system parameters
-    SystemUtils::setIfExists(runtime, options, "verbose", params.verbosity);
+  if (!jsInvoker_) {
+    // Fallback to synchronous if no CallInvoker available - this should not happen normally
+    throw jsi::JSError(runtime, "CallInvoker not available for async operation");
+  }
-    // RoPE settings if provided
-    if (options.hasProperty(runtime, "yarn_ext_factor")) {
-      params.yarn_ext_factor = options.getProperty(runtime, "yarn_ext_factor").asNumber();
-    }
-    if (options.hasProperty(runtime, "yarn_attn_factor")) {
-      params.yarn_attn_factor = options.getProperty(runtime, "yarn_attn_factor").asNumber();
-    }
-    if (options.hasProperty(runtime, "yarn_beta_fast")) {
-      params.yarn_beta_fast = options.getProperty(runtime, "yarn_beta_fast").asNumber();
-    }
-    if (options.hasProperty(runtime, "yarn_beta_slow")) {
-      params.yarn_beta_slow = options.getProperty(runtime, "yarn_beta_slow").asNumber();
-    }
+  // Parse all options to native types on JSI thread
+  std::string model_path = options.getProperty(runtime, "model").asString(runtime).utf8(runtime);
+  SystemUtils::normalizeFilePath(model_path);
+  // Parse all numeric/boolean options to native types
+  int n_ctx = 2048;  // defaults
+  int n_batch = 512;
+  int n_ubatch = 512;
+  int n_keep = 0;
+  bool use_mmap = true;
+  bool use_mlock = false;
+  bool use_jinja = false;
+  bool embedding = false;
+  int n_threads = 0;
+  int n_gpu_layers = 0;
+  std::string logits_file;
+  float rope_freq_base = 10000.0f;
+  float rope_freq_scale = 1.0f;
+  uint32_t seed = 4294967295U; // default seed
+  int verbosity = 0;
+  float yarn_ext_factor = 1.0f;
+  float yarn_attn_factor = 1.0f;
+  float yarn_beta_fast = 32.0f;
+  float yarn_beta_slow = 1.0f;
+  std::string chat_template;
+  // Parse options to native types
+  SystemUtils::setIfExists(runtime, options, "n_ctx", n_ctx);
+  SystemUtils::setIfExists(runtime, options, "n_batch", n_batch);
+  SystemUtils::setIfExists(runtime, options, "n_ubatch", n_ubatch);
+  SystemUtils::setIfExists(runtime, options, "n_keep", n_keep);
+  SystemUtils::setIfExists(runtime, options, "use_mmap", use_mmap);
+  SystemUtils::setIfExists(runtime, options, "use_mlock", use_mlock);
+  SystemUtils::setIfExists(runtime, options, "use_jinja", use_jinja);
+  SystemUtils::setIfExists(runtime, options, "embedding", embedding);
+  SystemUtils::setIfExists(runtime, options, "rope_freq_base", rope_freq_base);
+  SystemUtils::setIfExists(runtime, options, "rope_freq_scale", rope_freq_scale);
+  SystemUtils::setIfExists(runtime, options, "seed", seed);
+  SystemUtils::setIfExists(runtime, options, "verbose", verbosity);
+  SystemUtils::setIfExists(runtime, options, "logits_file", logits_file);
+  SystemUtils::setIfExists(runtime, options, "chat_template", chat_template);
+  if (options.hasProperty(runtime, "n_threads")) {
+    n_threads = options.getProperty(runtime, "n_threads").asNumber();
+  } else {
+    n_threads = SystemUtils::getOptimalThreadCount();
+  }
-    // Support for chat template override
-    std::string chat_template;
-    if (SystemUtils::setIfExists(runtime, options, "chat_template", chat_template)) {
-      params.chat_template = chat_template;
-    }
+  bool gpuSupported = llama_supports_gpu_offload();
+  if (options.hasProperty(runtime, "n_gpu_layers") && gpuSupported) {
+    n_gpu_layers = options.getProperty(runtime, "n_gpu_layers").asNumber();
+  }
-    // Support for LoRA adapters
-    if (options.hasProperty(runtime, "lora_adapters") && options.getProperty(runtime, "lora_adapters").isObject()) {
-      jsi::Object lora_obj = options.getProperty(runtime, "lora_adapters").asObject(runtime);
-      if (lora_obj.isArray(runtime)) {
-        jsi::Array lora_array = lora_obj.asArray(runtime);
-        size_t n_lora = lora_array.size(runtime);
-        for (size_t i = 0; i < n_lora; i++) {
-          if (lora_array.getValueAtIndex(runtime, i).isObject()) {
-            jsi::Object adapter = lora_array.getValueAtIndex(runtime, i).asObject(runtime);
-            if (adapter.hasProperty(runtime, "path") && adapter.getProperty(runtime, "path").isString()) {
-              common_adapter_lora_info lora;
-              lora.path = adapter.getProperty(runtime, "path").asString(runtime).utf8(runtime);
-              // Get scale if provided
-              lora.scale = 1.0f; // Default scale
-              if (adapter.hasProperty(runtime, "scale") && adapter.getProperty(runtime, "scale").isNumber()) {
-                lora.scale = adapter.getProperty(runtime, "scale").asNumber();
-              }
+  if (options.hasProperty(runtime, "yarn_ext_factor")) {
+    yarn_ext_factor = options.getProperty(runtime, "yarn_ext_factor").asNumber();
+  }
+  if (options.hasProperty(runtime, "yarn_attn_factor")) {
+    yarn_attn_factor = options.getProperty(runtime, "yarn_attn_factor").asNumber();
+  }
+  if (options.hasProperty(runtime, "yarn_beta_fast")) {
+    yarn_beta_fast = options.getProperty(runtime, "yarn_beta_fast").asNumber();
+  }
+  if (options.hasProperty(runtime, "yarn_beta_slow")) {
+    yarn_beta_slow = options.getProperty(runtime, "yarn_beta_slow").asNumber();
+  }
-              params.lora_adapters.push_back(lora);
+  // Parse LoRA adapters to native structure
+  std::vector<std::pair<std::string, float>> lora_adapters;
+  if (options.hasProperty(runtime, "lora_adapters") && options.getProperty(runtime, "lora_adapters").isObject()) {
+    jsi::Object lora_obj = options.getProperty(runtime, "lora_adapters").asObject(runtime);
+    if (lora_obj.isArray(runtime)) {
+      jsi::Array lora_array = lora_obj.asArray(runtime);
+      size_t n_lora = lora_array.size(runtime);
+      for (size_t i = 0; i < n_lora; i++) {
+        if (lora_array.getValueAtIndex(runtime, i).isObject()) {
+          jsi::Object adapter = lora_array.getValueAtIndex(runtime, i).asObject(runtime);
+          if (adapter.hasProperty(runtime, "path") && adapter.getProperty(runtime, "path").isString()) {
+            std::string lora_path = adapter.getProperty(runtime, "path").asString(runtime).utf8(runtime);
+            float lora_scale = 1.0f; // Default scale
+            if (adapter.hasProperty(runtime, "scale") && adapter.getProperty(runtime, "scale").isNumber()) {
+              lora_scale = adapter.getProperty(runtime, "scale").asNumber();
             }
+            lora_adapters.emplace_back(lora_path, lora_scale);
           }
         }
       }
     }
+  }
-    // Initialize using common_init_from_params
-    common_init_result result;
-    try {
-      result = common_init_from_params(params);
+  // Create Promise constructor
+  auto Promise = runtime.global().getPropertyAsFunction(runtime, "Promise");
+  auto executor = jsi::Function::createFromHostFunction(
+    runtime,
+    jsi::PropNameID::forAscii(runtime, "executor"),
+    2,
+    [this, model_path, n_ctx, n_batch, n_ubatch, n_keep, use_mmap, use_mlock, use_jinja, embedding, n_threads, n_gpu_layers, logits_file, rope_freq_base, rope_freq_scale, seed, verbosity, yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow, chat_template, lora_adapters](jsi::Runtime& runtime, const jsi::Value& thisValue, const jsi::Value* args, size_t count) -> jsi::Value {
-      // Check if initialization was successful
-      if (!result.model || !result.context) {
-        throw std::runtime_error("Failed to initialize model and context");
-      }
-    } catch (const std::exception& e) {
-      // If we were trying to use GPU and got a Vulkan/shader error, retry with CPU-only
-      if (params.n_gpu_layers > 0) {
-        // Other GPU error, still try CPU fallback
-        fprintf(stderr, "GPU initialization failed (%s), retrying with CPU-only\n", e.what());
-        params.n_gpu_layers = 0;
+      auto resolve = std::make_shared<jsi::Function>(args[0].asObject(runtime).asFunction(runtime));
+      auto reject = std::make_shared<jsi::Function>(args[1].asObject(runtime).asFunction(runtime));
+      // Create shared references to runtime and invoker for thread safety
+      auto runtimePtr = &runtime;
+      auto invoker = jsInvoker_;
+      auto selfPtr = shared_from_this();
+      // Launch background thread for model initialization
+      std::thread([selfPtr, model_path, n_ctx, n_batch, n_ubatch, n_keep, use_mmap, use_mlock, use_jinja, embedding, n_threads, n_gpu_layers, logits_file, rope_freq_base, rope_freq_scale, seed, verbosity, yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow, chat_template, lora_adapters, resolve, reject, runtimePtr, invoker]() {
         try {
-          result = common_init_from_params(params);
+          // Thread-safe access to member variables
+          std::lock_guard<std::mutex> lock(selfPtr->mutex_);
+          // Initialize llama backend
+          llama_backend_init();
+          // Initialize params with defaults
+          rn_common_params params;
+          // Set default sampling parameters
+          params.sampling = common_params_sampling();
+          // Set all parsed native values
+          params.model.path = model_path;
+          params.n_ctx = n_ctx;
+          params.n_batch = n_batch;
+          params.n_ubatch = n_ubatch;
+          params.n_keep = n_keep;
+          params.use_mmap = use_mmap;
+          params.use_mlock = use_mlock;
+          params.use_jinja = use_jinja;
+          params.embedding = embedding;
+          params.cpuparams.n_threads = n_threads;
+          params.n_gpu_layers = n_gpu_layers;
+          params.logits_file = logits_file;
+          params.rope_freq_base = rope_freq_base;
+          params.rope_freq_scale = rope_freq_scale;
+          params.sampling.seed = seed;
+          params.verbosity = verbosity;
+          params.yarn_ext_factor = yarn_ext_factor;
+          params.yarn_attn_factor = yarn_attn_factor;
+          params.yarn_beta_fast = yarn_beta_fast;
+          params.yarn_beta_slow = yarn_beta_slow;
+          if (!chat_template.empty()) {
+            params.chat_template = chat_template;
+          }
+          // Add LoRA adapters
+          for (const auto& lora : lora_adapters) {
+            common_adapter_lora_info lora_info;
+            lora_info.path = lora.first;
+            lora_info.scale = lora.second;
+            params.lora_adapters.push_back(lora_info);
+          }
+          // Initialize using common_init_from_params
+          common_init_result result;
-          if (!result.model || !result.context) {
-            throw std::runtime_error("Failed to initialize model and context even with CPU-only mode");
+          try {
+            result = common_init_from_params(params);
+            // Check if initialization was successful
+            if (!result.model || !result.context) {
+              throw std::runtime_error("Failed to initialize model and context");
+            }
+          } catch (const std::exception& e) {
+            // If we were trying to use GPU and got an error, retry with CPU-only
+            if (params.n_gpu_layers > 0) {
+              fprintf(stderr, "GPU initialization failed (%s), retrying with CPU-only\n", e.what());
+              params.n_gpu_layers = 0;
+              try {
+                result = common_init_from_params(params);
+                if (!result.model || !result.context) {
+                  throw std::runtime_error("Failed to initialize model and context even with CPU-only mode");
+                }
+                fprintf(stderr, "Successfully recovered with CPU-only mode after GPU failure\n");
+              } catch (const std::exception& cpu_e) {
+                throw std::runtime_error(std::string("Model initialization failed: ") + cpu_e.what());
+              }
+            } else {
+              // Was already CPU-only, re-throw the original error
+              throw std::runtime_error(std::string("Model initialization failed: ") + e.what());
+            }
+          }
+          // Create and initialize rn_llama_context
+          selfPtr->rn_ctx_ = std::make_unique<facebook::react::rn_llama_context>();
+          selfPtr->rn_ctx_->model = result.model.release();
+          selfPtr->rn_ctx_->ctx = result.context.release();
+          selfPtr->rn_ctx_->model_loaded = true;
+          selfPtr->rn_ctx_->vocab = llama_model_get_vocab(selfPtr->rn_ctx_->model);
+          // Create a rn_common_params from the common_params
+          rn_common_params rn_params;
+          // Copy the base class fields
+          static_cast<common_params&>(rn_params) = params;
+          // Set additional fields
+          rn_params.use_jinja = params.use_jinja;
+          rn_params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
+          // Now assign to the context
+          selfPtr->rn_ctx_->params = rn_params;
+          selfPtr->rn_ctx_->chat_templates = common_chat_templates_init(selfPtr->rn_ctx_->model, params.chat_template);
+          try {
+              common_chat_format_example(selfPtr->rn_ctx_->chat_templates.get(), params.use_jinja);
+          } catch (const std::exception & e) {
+              // Fallback to chatml if the original template parsing fails
+              selfPtr->rn_ctx_->chat_templates = common_chat_templates_init(selfPtr->rn_ctx_->model, "chatml");
           }
+          // Schedule success callback on JS thread to create JSI objects
+          invoker->invokeAsync([selfPtr, resolve, runtimePtr]() {
+            try {
+              // Create the model object and resolve Promise on JS thread
+              jsi::Object modelObject = selfPtr->createModelObject(*runtimePtr, selfPtr->rn_ctx_.get());
+              resolve->call(*runtimePtr, modelObject);
+            } catch (const std::exception& e) {
+              // If conversion fails, create a simple error response
+              jsi::Object errorObj(*runtimePtr);
+              errorObj.setProperty(*runtimePtr, "error", jsi::String::createFromUtf8(*runtimePtr, e.what()));
+              resolve->call(*runtimePtr, errorObj);
+            }
+          });
-          fprintf(stderr, "Successfully recovered with CPU-only mode after GPU failure\n");
-        } catch (const std::exception& cpu_e) {
-          throw std::runtime_error(std::string("Model initialization failed: ") + cpu_e.what());
+        } catch (const std::exception& e) {
+          // Schedule error callback on JS thread
+          std::string errorMsg(e.what());
+          fprintf(stderr, "initLlama error: %s\n", errorMsg.c_str());
+          invoker->invokeAsync([reject, errorMsg, runtimePtr]() {
+            try {
+              reject->call(*runtimePtr, jsi::String::createFromUtf8(*runtimePtr, errorMsg));
+            } catch (...) {
+              // Ignore rejection errors
+            }
+          });
         }
-      } else {
-        // Was already CPU-only, re-throw the original error
-        throw std::runtime_error(std::string("Model initialization failed: ") + e.what());
-      }
-    }
-    // Create and initialize rn_llama_context
-    rn_ctx_ = std::make_unique<facebook::react::rn_llama_context>();
-    rn_ctx_->model = result.model.release();
-    rn_ctx_->ctx = result.context.release();
-    rn_ctx_->model_loaded = true;
-    rn_ctx_->vocab = llama_model_get_vocab(rn_ctx_->model);
-    // Create a rn_common_params from the common_params
-    rn_common_params rn_params;
-    // Copy the base class fields
-    static_cast<common_params&>(rn_params) = params;
-    // Set additional fields
-    rn_params.use_jinja = params.use_jinja;
-    rn_params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
-    // Don't force a specific chat format - let the template system auto-detect based on model and tools
-    // rn_params.chat_format = COMMON_CHAT_FORMAT_GENERIC;
-    // Now assign to the context
-    rn_ctx_->params = rn_params;
-    rn_ctx_->chat_templates = common_chat_templates_init(rn_ctx_->model, params.chat_template);
-    try {
-        common_chat_format_example(rn_ctx_->chat_templates.get(), params.use_jinja);
-    } catch (const std::exception & e) {
-        // Fallback to chatml if the original template parsing fails
-        rn_ctx_->chat_templates = common_chat_templates_init(rn_ctx_->model, "chatml");
+      }).detach();
+      return jsi::Value::undefined();
     }
-    // Create the model object and return it
-    return createModelObject(runtime, rn_ctx_.get());
-  } catch (const std::exception& e) {
-    // We can keep this top-level error log as it's for initialization failure
-    fprintf(stderr, "initLlama error: %s\n", e.what());
-    throw jsi::JSError(runtime, e.what());
-  }
+  );
+  return Promise.callAsConstructor(runtime, std::move(executor));
 }
 jsi::Object PureCppImpl::createModelObject(jsi::Runtime& runtime, rn_llama_context* rn_ctx) {

package/cpp/PureCppImpl.h CHANGED

@@ -28,7 +28,7 @@ class LlamaCppModel;     // Forward declare LlamaCppModel
 namespace facebook::react {
 // Note: The class name is PureCppImpl, and it derives from your project's C++ spec
-class PureCppImpl : public NativeRNLlamaCppCxxSpec<PureCppImpl> {
+class PureCppImpl : public NativeRNLlamaCppCxxSpec<PureCppImpl>, public std::enable_shared_from_this<PureCppImpl> {
 public:
     // Constructor
     PureCppImpl(std::shared_ptr<CallInvoker> jsInvoker);

package/cpp/rn-completion.cpp CHANGED

@@ -147,30 +147,13 @@ CompletionResult run_completion(
         json data = options.to_json();
         // Prepare the sampling parameters
         const auto& params = rn_ctx->params;
-        // Set the prompt
-        if (data.contains("prompt")) {
-            // Tokenize the prompt
-            const auto& tokenized_prompts = tokenize_input_prompts(rn_ctx->vocab, data["prompt"], true, true);
-            if (tokenized_prompts.empty() || tokenized_prompts[0].empty()) {
-                result.success = false;
-                result.error_msg = "Empty prompt";
-                result.error_type = RN_ERROR_INVALID_PARAM;
-                return result;
-            }
-            state.prompt_tokens = std::move(tokenized_prompts[0]);
-        } else {
-            result.success = false;
-            result.error_msg = "No prompt provided";
-            result.error_type = RN_ERROR_INVALID_PARAM;
-            return result;
+        // Create a copy of sampling parameters and apply grammar if provided
+        common_params_sampling sampling_params = params.sampling;
+        if (!options.grammar.empty()) {
+            sampling_params.grammar = options.grammar;
         }
-        // Configure state
-        state.n_ctx = llama_n_ctx(rn_ctx->ctx);
-        state.n_predict = options.n_predict > 0 ? options.n_predict : params.n_predict;
-        state.n_remaining = state.n_predict;
         // Parse tool_choice
         if (options.tool_choice == "auto") {
             state.tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
@@ -179,8 +162,8 @@ CompletionResult run_completion(
         } else if (options.tool_choice == "required") {
             state.tool_choice = COMMON_CHAT_TOOL_CHOICE_REQUIRED;
         }
-        // Initialize the sampler
-        state.sampler = common_sampler_init(rn_ctx->model, params.sampling);
+        // Initialize the sampler with the updated sampling parameters
+        state.sampler = common_sampler_init(rn_ctx->model, sampling_params);
         if (!state.sampler) {
             result.success = false;
             result.error_msg = "Failed to initialize sampler";
@@ -201,6 +184,29 @@ CompletionResult run_completion(
             }
         }
+        // Set the prompt
+        if (data.contains("prompt")) {
+            // Tokenize the prompt
+            const auto& tokenized_prompts = tokenize_input_prompts(rn_ctx->vocab, data["prompt"], true, true);
+            if (tokenized_prompts.empty() || tokenized_prompts[0].empty()) {
+                result.success = false;
+                result.error_msg = "Empty prompt";
+                result.error_type = RN_ERROR_INVALID_PARAM;
+                return result;
+            }
+            state.prompt_tokens = std::move(tokenized_prompts[0]);
+        } else {
+            result.success = false;
+            result.error_msg = "No prompt provided";
+            result.error_type = RN_ERROR_INVALID_PARAM;
+            return result;
+        }
+        // Configure state
+        state.n_ctx = llama_n_ctx(rn_ctx->ctx);
+        state.n_predict = options.n_predict > 0 ? options.n_predict : params.n_predict;
+        state.n_remaining = state.n_predict;
         // Process the prompt
         for (int i = 0; i < (int)state.prompt_tokens.size(); ++i) {
             llama_token token = state.prompt_tokens[i];
@@ -222,7 +228,11 @@ CompletionResult run_completion(
                 return result;
             }
-            common_sampler_accept(state.sampler, token, true);
+            // Only accept tokens during prompt processing if no grammar is present
+            // Grammar-based sampling needs to start fresh from the generation phase
+            if (sampling_params.grammar.empty()) {
+                common_sampler_accept(state.sampler, token, true);
+            }
             state.n_past++;
         }
@@ -435,31 +445,15 @@ CompletionResult run_chat_completion(
             // Add parsed content and tool calls if available
             if (has_parsed_content && !parsed_msg.tool_calls.empty()) {
-                // Set content to the parsed content (may be null for tool-only responses)
-                if (!parsed_msg.content.empty()) {
-                    choice["message"]["content"] = parsed_msg.content;
-                } else {
-                    choice["message"]["content"] = nullptr;
-                }
-                // Add tool calls to the message
-                json tool_calls = json::array();
-                for (const auto& tool_call : parsed_msg.tool_calls) {
-                    json tc = {
-                        {"id", tool_call.id.empty() ? ("call_" + std::to_string(std::rand())) : tool_call.id},
-                        {"type", "function"},
-                        {"function", {
-                            {"name", tool_call.name},
-                            {"arguments", tool_call.arguments}
-                        }}
-                    };
-                    tool_calls.push_back(tc);
-                }
-                choice["message"]["tool_calls"] = tool_calls;
+                // Use the server.cpp approach: let the common_chat_msg handle the JSON conversion
+                choice["message"] = parsed_msg.to_json_oaicompat<json>();
                 choice["finish_reason"] = "tool_calls";
+            } else if (has_parsed_content && !parsed_msg.content.empty()) {
+                // Regular text response with parsed content
+                choice["message"]["content"] = parsed_msg.content;
             } else {
-                // Regular text response
-                choice["message"]["content"] = has_parsed_content ? parsed_msg.content : result.content;
+                // Fallback to raw content if parsing failed or no tools
+                choice["message"]["content"] = result.content;
             }
             choices.push_back(choice);

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@novastera-oss/llamarn",
-  "version": "0.2.3",
+  "version": "0.2.5",
   "description": "An attempt at a pure cpp turbo module library",
   "source": "./src/index.tsx",
   "main": "./lib/module/index.js",

package/src/NativeRNLlamaCpp.ts CHANGED

@@ -234,7 +234,7 @@ export interface Spec extends TurboModule {
   // Initialize a Llama context with the given model parameters
   initLlama(params: LlamaModelParams): Promise<LlamaContextType & LlamaContextMethods>;
-  // Load model info without creating a full contex
+  // Load model info without creating a full context
   loadLlamaModelInfo(modelPath: string): Promise<{
     n_params: number;
     n_vocab: number;