@novastera-oss/llamarn 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -43,267 +43,389 @@ double PureCppImpl::multiply(jsi::Runtime& rt, double a, double b) {
43
43
  }
44
44
 
45
45
  jsi::Value PureCppImpl::loadLlamaModelInfo(jsi::Runtime &runtime, jsi::String modelPath) {
46
+ // Parse JSI arguments to native types on JSI thread
46
47
  std::string path = modelPath.utf8(runtime);
47
48
  SystemUtils::normalizeFilePath(path);
48
49
 
49
- try {
50
- // Initialize llama backend
51
- llama_backend_init();
52
-
53
- // Create model params
54
- llama_model_params params = llama_model_default_params();
55
- params.n_gpu_layers = 0; // Use CPU for model info loading
56
-
57
- // Load the model
58
- llama_model* model = llama_model_load_from_file(path.c_str(), params);
59
-
60
- if (!model) {
61
- throw std::runtime_error("Failed to load model from file: " + path);
62
- }
63
-
64
- // Create result object
65
- jsi::Object result(runtime);
50
+ if (!jsInvoker_) {
51
+ // No synchronous fallback exists: without a CallInvoker the async path cannot run, so throw - this should not happen normally
52
+ throw jsi::JSError(runtime, "CallInvoker not available for async operation");
53
+ }
66
54
 
67
- // Get model parameters
68
- result.setProperty(runtime, "n_params", jsi::Value((double)llama_model_n_params(model)));
55
+ // Create Promise constructor
56
+ auto Promise = runtime.global().getPropertyAsFunction(runtime, "Promise");
57
+
58
+ auto executor = jsi::Function::createFromHostFunction(
59
+ runtime,
60
+ jsi::PropNameID::forAscii(runtime, "executor"),
61
+ 2,
62
+ [this, path](jsi::Runtime& runtime, const jsi::Value& thisValue, const jsi::Value* args, size_t count) -> jsi::Value {
63
+
64
+ auto resolve = std::make_shared<jsi::Function>(args[0].asObject(runtime).asFunction(runtime));
65
+ auto reject = std::make_shared<jsi::Function>(args[1].asObject(runtime).asFunction(runtime));
66
+
67
+ // Capture a raw pointer to the runtime, a copy of the invoker, and a shared_ptr to this object for use by the background thread
68
+ auto runtimePtr = &runtime;
69
+ auto invoker = jsInvoker_;
70
+ auto selfPtr = shared_from_this();
71
+
72
+ // Launch background thread for model info loading
73
+ std::thread([selfPtr, path, resolve, reject, runtimePtr, invoker]() {
74
+ try {
75
+ // Initialize llama backend
76
+ llama_backend_init();
69
77
 
70
- // Get vocabulary
71
- const llama_vocab* vocab = llama_model_get_vocab(model);
72
- result.setProperty(runtime, "n_vocab", jsi::Value((double)llama_vocab_n_tokens(vocab)));
78
+ // Create model params
79
+ llama_model_params params = llama_model_default_params();
80
+ params.n_gpu_layers = 0; // Use CPU for model info loading
73
81
 
74
- // Get context size
75
- result.setProperty(runtime, "n_context", jsi::Value((double)llama_model_n_ctx_train(model)));
82
+ // Load the model
83
+ llama_model* model = llama_model_load_from_file(path.c_str(), params);
76
84
 
77
- // Get embedding size
78
- result.setProperty(runtime, "n_embd", jsi::Value((double)llama_model_n_embd(model)));
85
+ if (!model) {
86
+ throw std::runtime_error("Failed to load model from file: " + path);
87
+ }
79
88
 
80
- // Get model description
81
- char buf[512];
82
- llama_model_desc(model, buf, sizeof(buf));
83
- result.setProperty(runtime, "description",
84
- jsi::String::createFromUtf8(runtime, buf[0] ? buf : "Unknown model"));
89
+ // Get model information (native types)
90
+ double n_params = (double)llama_model_n_params(model);
91
+ const llama_vocab* vocab = llama_model_get_vocab(model);
92
+ double n_vocab = (double)llama_vocab_n_tokens(vocab);
93
+ double n_context = (double)llama_model_n_ctx_train(model);
94
+ double n_embd = (double)llama_model_n_embd(model);
95
+
96
+ // Get model description
97
+ char buf[512];
98
+ llama_model_desc(model, buf, sizeof(buf));
99
+ std::string description = buf[0] ? buf : "Unknown model";
100
+
101
+ // Check if GPU is supported
102
+ bool gpuSupported = llama_supports_gpu_offload();
103
+
104
+ // Calculate optimal GPU layers if GPU is supported
105
+ int optimalGpuLayers = 0;
106
+ if (gpuSupported) {
107
+ optimalGpuLayers = SystemUtils::getOptimalGpuLayers(model);
108
+ }
85
109
 
86
- // Check if GPU is supported
87
- bool gpuSupported = llama_supports_gpu_offload();
88
- result.setProperty(runtime, "gpuSupported", jsi::Value(gpuSupported));
110
+ // Extract quantization type from model description
111
+ std::string desc(buf);
112
+ std::string quantType = "Unknown";
113
+ size_t qPos = desc.find(" Q");
114
+ if (qPos != std::string::npos && qPos + 5 <= desc.length()) {
115
+ // Extract quantization string (like Q4_K, Q5_K, Q8_0)
116
+ quantType = desc.substr(qPos + 1, 4);
117
+ // Remove any trailing non-alphanumeric characters
118
+ quantType.erase(std::find_if(quantType.rbegin(), quantType.rend(), [](char c) {
119
+ return std::isalnum(c);
120
+ }).base(), quantType.end());
121
+ }
89
122
 
90
- // Calculate optimal GPU layers if GPU is supported
91
- int optimalGpuLayers = 0;
92
- if (gpuSupported) {
93
- optimalGpuLayers = SystemUtils::getOptimalGpuLayers(model);
94
- }
95
- result.setProperty(runtime, "optimalGpuLayers", jsi::Value(optimalGpuLayers));
96
-
97
- // Extract quantization type from model description
98
- std::string desc(buf);
99
- std::string quantType = "Unknown";
100
- size_t qPos = desc.find(" Q");
101
- if (qPos != std::string::npos && qPos + 5 <= desc.length()) {
102
- // Extract quantization string (like Q4_K, Q5_K, Q8_0)
103
- quantType = desc.substr(qPos + 1, 4);
104
- // Remove any trailing non-alphanumeric characters
105
- quantType.erase(std::find_if(quantType.rbegin(), quantType.rend(), [](char c) {
106
- return std::isalnum(c);
107
- }).base(), quantType.end());
123
+ // Free the model
124
+ llama_model_free(model);
125
+
126
+ // Schedule success callback on JS thread to create JSI objects
127
+ invoker->invokeAsync([selfPtr, resolve, n_params, n_vocab, n_context, n_embd, description, gpuSupported, optimalGpuLayers, quantType, runtimePtr]() {
128
+ try {
129
+ // Create result object on JS thread
130
+ jsi::Object result(*runtimePtr);
131
+ result.setProperty(*runtimePtr, "n_params", jsi::Value(n_params));
132
+ result.setProperty(*runtimePtr, "n_vocab", jsi::Value(n_vocab));
133
+ result.setProperty(*runtimePtr, "n_context", jsi::Value(n_context));
134
+ result.setProperty(*runtimePtr, "n_embd", jsi::Value(n_embd));
135
+ result.setProperty(*runtimePtr, "description", jsi::String::createFromUtf8(*runtimePtr, description));
136
+ result.setProperty(*runtimePtr, "gpuSupported", jsi::Value(gpuSupported));
137
+ result.setProperty(*runtimePtr, "optimalGpuLayers", jsi::Value(optimalGpuLayers));
138
+ result.setProperty(*runtimePtr, "quant_type", jsi::String::createFromUtf8(*runtimePtr, quantType));
139
+ result.setProperty(*runtimePtr, "architecture", jsi::String::createFromUtf8(*runtimePtr, "Unknown"));
140
+
141
+ resolve->call(*runtimePtr, result);
142
+ } catch (const std::exception& e) {
143
+ // If conversion fails, resolve (not reject) the Promise with an object whose "error" property holds the message
144
+ jsi::Object errorObj(*runtimePtr);
145
+ errorObj.setProperty(*runtimePtr, "error", jsi::String::createFromUtf8(*runtimePtr, e.what()));
146
+ resolve->call(*runtimePtr, errorObj);
147
+ }
148
+ });
149
+
150
+ } catch (const std::exception& e) {
151
+ // Schedule error callback on JS thread
152
+ std::string errorMsg(e.what());
153
+ invoker->invokeAsync([reject, errorMsg, runtimePtr]() {
154
+ try {
155
+ reject->call(*runtimePtr, jsi::String::createFromUtf8(*runtimePtr, errorMsg));
156
+ } catch (...) {
157
+ // Ignore rejection errors
158
+ }
159
+ });
160
+ }
161
+ }).detach();
162
+
163
+ return jsi::Value::undefined();
108
164
  }
109
- result.setProperty(runtime, "quant_type", jsi::String::createFromUtf8(runtime, quantType));
110
-
111
- // Add architecture info
112
- result.setProperty(runtime, "architecture",
113
- jsi::String::createFromUtf8(runtime, "Unknown"));
114
-
115
- // Free the model
116
- llama_model_free(model);
117
-
118
- return result;
119
- } catch (const std::exception& e) {
120
- jsi::Object error(runtime);
121
- error.setProperty(runtime, "message", jsi::String::createFromUtf8(runtime, e.what()));
122
- throw jsi::JSError(runtime, error.getProperty(runtime, "message").asString(runtime));
123
- }
165
+ );
166
+
167
+ return Promise.callAsConstructor(runtime, std::move(executor));
124
168
  }
125
169
 
126
170
  jsi::Value PureCppImpl::initLlama(jsi::Runtime &runtime, jsi::Object options) {
127
- std::lock_guard<std::mutex> lock(mutex_);
128
-
129
- try {
130
- // Get model path - required (preserve custom path handling)
131
- if (!options.hasProperty(runtime, "model")) {
132
- throw std::runtime_error("model path is required");
133
- }
134
-
135
- // Initialize llama backend
136
- llama_backend_init();
137
-
138
- std::string model_path = options.getProperty(runtime, "model").asString(runtime).utf8(runtime);
139
- SystemUtils::normalizeFilePath(model_path);
140
-
141
- // Initialize params with defaults
142
- rn_common_params params;
143
-
144
- // Set default sampling parameters
145
- params.sampling = common_params_sampling();
146
-
147
- // Set model path
148
- params.model.path = model_path;
149
-
150
- // Override defaults with user settings if provided
151
- SystemUtils::setIfExists(runtime, options, "n_ctx", params.n_ctx);
152
- SystemUtils::setIfExists(runtime, options, "n_batch", params.n_batch);
153
- SystemUtils::setIfExists(runtime, options, "n_ubatch", params.n_ubatch);
154
- SystemUtils::setIfExists(runtime, options, "n_keep", params.n_keep);
155
-
156
- // Memory and resource options - MUST respect user settings
157
- SystemUtils::setIfExists(runtime, options, "use_mmap", params.use_mmap);
158
- SystemUtils::setIfExists(runtime, options, "use_mlock", params.use_mlock);
159
- SystemUtils::setIfExists(runtime, options, "use_jinja", params.use_jinja);
160
-
161
- // Extract threading parameters (preserve custom thread logic)
162
- int n_threads = 0; // 0 = auto
163
- if (options.hasProperty(runtime, "n_threads")) {
164
- n_threads = options.getProperty(runtime, "n_threads").asNumber();
165
- } else {
166
- n_threads = SystemUtils::getOptimalThreadCount();
167
- }
168
- params.cpuparams.n_threads = n_threads;
169
-
170
- // Set n_gpu_layers (preserve custom GPU logic)
171
- int n_gpu_layers = 0;
172
- bool gpuSupported = llama_supports_gpu_offload();
173
- if (options.hasProperty(runtime, "n_gpu_layers") && gpuSupported) {
174
- n_gpu_layers = options.getProperty(runtime, "n_gpu_layers").asNumber();
175
- }
176
- params.n_gpu_layers = n_gpu_layers;
177
-
178
- // Additional model parameters
179
- SystemUtils::setIfExists(runtime, options, "logits_file", params.logits_file);
180
- SystemUtils::setIfExists(runtime, options, "embedding", params.embedding);
181
- SystemUtils::setIfExists(runtime, options, "rope_freq_base", params.rope_freq_base);
182
- SystemUtils::setIfExists(runtime, options, "rope_freq_scale", params.rope_freq_scale);
183
-
184
- // Sampling parameters
185
- SystemUtils::setIfExists(runtime, options, "seed", params.sampling.seed);
171
+ // Parse JSI arguments to native types on JSI thread
172
+ if (!options.hasProperty(runtime, "model")) {
173
+ throw jsi::JSError(runtime, "model path is required");
174
+ }
186
175
 
187
- // Other system parameters
188
- SystemUtils::setIfExists(runtime, options, "verbose", params.verbosity);
176
+ if (!jsInvoker_) {
177
+ // No synchronous fallback exists: without a CallInvoker the async path cannot run, so throw - this should not happen normally
178
+ throw jsi::JSError(runtime, "CallInvoker not available for async operation");
179
+ }
189
180
 
190
- // RoPE settings if provided
191
- if (options.hasProperty(runtime, "yarn_ext_factor")) {
192
- params.yarn_ext_factor = options.getProperty(runtime, "yarn_ext_factor").asNumber();
193
- }
194
- if (options.hasProperty(runtime, "yarn_attn_factor")) {
195
- params.yarn_attn_factor = options.getProperty(runtime, "yarn_attn_factor").asNumber();
196
- }
197
- if (options.hasProperty(runtime, "yarn_beta_fast")) {
198
- params.yarn_beta_fast = options.getProperty(runtime, "yarn_beta_fast").asNumber();
199
- }
200
- if (options.hasProperty(runtime, "yarn_beta_slow")) {
201
- params.yarn_beta_slow = options.getProperty(runtime, "yarn_beta_slow").asNumber();
202
- }
181
+ // Parse all options to native types on JSI thread
182
+ std::string model_path = options.getProperty(runtime, "model").asString(runtime).utf8(runtime);
183
+ SystemUtils::normalizeFilePath(model_path);
184
+
185
+ // Parse all numeric/boolean options to native types
186
+ int n_ctx = 2048; // defaults
187
+ int n_batch = 512;
188
+ int n_ubatch = 512;
189
+ int n_keep = 0;
190
+ bool use_mmap = true;
191
+ bool use_mlock = false;
192
+ bool use_jinja = false;
193
+ bool embedding = false;
194
+ int n_threads = 0;
195
+ int n_gpu_layers = 0;
196
+ std::string logits_file;
197
+ float rope_freq_base = 10000.0f;
198
+ float rope_freq_scale = 1.0f;
199
+ uint32_t seed = 4294967295U; // default seed
200
+ int verbosity = 0;
201
+ float yarn_ext_factor = 1.0f;
202
+ float yarn_attn_factor = 1.0f;
203
+ float yarn_beta_fast = 32.0f;
204
+ float yarn_beta_slow = 1.0f;
205
+ std::string chat_template;
206
+
207
+ // Parse options to native types
208
+ SystemUtils::setIfExists(runtime, options, "n_ctx", n_ctx);
209
+ SystemUtils::setIfExists(runtime, options, "n_batch", n_batch);
210
+ SystemUtils::setIfExists(runtime, options, "n_ubatch", n_ubatch);
211
+ SystemUtils::setIfExists(runtime, options, "n_keep", n_keep);
212
+ SystemUtils::setIfExists(runtime, options, "use_mmap", use_mmap);
213
+ SystemUtils::setIfExists(runtime, options, "use_mlock", use_mlock);
214
+ SystemUtils::setIfExists(runtime, options, "use_jinja", use_jinja);
215
+ SystemUtils::setIfExists(runtime, options, "embedding", embedding);
216
+ SystemUtils::setIfExists(runtime, options, "rope_freq_base", rope_freq_base);
217
+ SystemUtils::setIfExists(runtime, options, "rope_freq_scale", rope_freq_scale);
218
+ SystemUtils::setIfExists(runtime, options, "seed", seed);
219
+ SystemUtils::setIfExists(runtime, options, "verbose", verbosity);
220
+ SystemUtils::setIfExists(runtime, options, "logits_file", logits_file);
221
+ SystemUtils::setIfExists(runtime, options, "chat_template", chat_template);
222
+
223
+ if (options.hasProperty(runtime, "n_threads")) {
224
+ n_threads = options.getProperty(runtime, "n_threads").asNumber();
225
+ } else {
226
+ n_threads = SystemUtils::getOptimalThreadCount();
227
+ }
203
228
 
204
- // Support for chat template override
205
- std::string chat_template;
206
- if (SystemUtils::setIfExists(runtime, options, "chat_template", chat_template)) {
207
- params.chat_template = chat_template;
208
- }
229
+ bool gpuSupported = llama_supports_gpu_offload();
230
+ if (options.hasProperty(runtime, "n_gpu_layers") && gpuSupported) {
231
+ n_gpu_layers = options.getProperty(runtime, "n_gpu_layers").asNumber();
232
+ }
209
233
 
210
- // Support for LoRA adapters
211
- if (options.hasProperty(runtime, "lora_adapters") && options.getProperty(runtime, "lora_adapters").isObject()) {
212
- jsi::Object lora_obj = options.getProperty(runtime, "lora_adapters").asObject(runtime);
213
- if (lora_obj.isArray(runtime)) {
214
- jsi::Array lora_array = lora_obj.asArray(runtime);
215
- size_t n_lora = lora_array.size(runtime);
216
-
217
- for (size_t i = 0; i < n_lora; i++) {
218
- if (lora_array.getValueAtIndex(runtime, i).isObject()) {
219
- jsi::Object adapter = lora_array.getValueAtIndex(runtime, i).asObject(runtime);
220
- if (adapter.hasProperty(runtime, "path") && adapter.getProperty(runtime, "path").isString()) {
221
- common_adapter_lora_info lora;
222
- lora.path = adapter.getProperty(runtime, "path").asString(runtime).utf8(runtime);
223
-
224
- // Get scale if provided
225
- lora.scale = 1.0f; // Default scale
226
- if (adapter.hasProperty(runtime, "scale") && adapter.getProperty(runtime, "scale").isNumber()) {
227
- lora.scale = adapter.getProperty(runtime, "scale").asNumber();
228
- }
234
+ if (options.hasProperty(runtime, "yarn_ext_factor")) {
235
+ yarn_ext_factor = options.getProperty(runtime, "yarn_ext_factor").asNumber();
236
+ }
237
+ if (options.hasProperty(runtime, "yarn_attn_factor")) {
238
+ yarn_attn_factor = options.getProperty(runtime, "yarn_attn_factor").asNumber();
239
+ }
240
+ if (options.hasProperty(runtime, "yarn_beta_fast")) {
241
+ yarn_beta_fast = options.getProperty(runtime, "yarn_beta_fast").asNumber();
242
+ }
243
+ if (options.hasProperty(runtime, "yarn_beta_slow")) {
244
+ yarn_beta_slow = options.getProperty(runtime, "yarn_beta_slow").asNumber();
245
+ }
229
246
 
230
- params.lora_adapters.push_back(lora);
247
+ // Parse LoRA adapters to native structure
248
+ std::vector<std::pair<std::string, float>> lora_adapters;
249
+ if (options.hasProperty(runtime, "lora_adapters") && options.getProperty(runtime, "lora_adapters").isObject()) {
250
+ jsi::Object lora_obj = options.getProperty(runtime, "lora_adapters").asObject(runtime);
251
+ if (lora_obj.isArray(runtime)) {
252
+ jsi::Array lora_array = lora_obj.asArray(runtime);
253
+ size_t n_lora = lora_array.size(runtime);
254
+
255
+ for (size_t i = 0; i < n_lora; i++) {
256
+ if (lora_array.getValueAtIndex(runtime, i).isObject()) {
257
+ jsi::Object adapter = lora_array.getValueAtIndex(runtime, i).asObject(runtime);
258
+ if (adapter.hasProperty(runtime, "path") && adapter.getProperty(runtime, "path").isString()) {
259
+ std::string lora_path = adapter.getProperty(runtime, "path").asString(runtime).utf8(runtime);
260
+ float lora_scale = 1.0f; // Default scale
261
+ if (adapter.hasProperty(runtime, "scale") && adapter.getProperty(runtime, "scale").isNumber()) {
262
+ lora_scale = adapter.getProperty(runtime, "scale").asNumber();
231
263
  }
264
+ lora_adapters.emplace_back(lora_path, lora_scale);
232
265
  }
233
266
  }
234
267
  }
235
268
  }
269
+ }
236
270
 
237
- // Initialize using common_init_from_params
238
- common_init_result result;
239
-
240
- try {
241
- result = common_init_from_params(params);
271
+ // Create Promise constructor
272
+ auto Promise = runtime.global().getPropertyAsFunction(runtime, "Promise");
273
+
274
+ auto executor = jsi::Function::createFromHostFunction(
275
+ runtime,
276
+ jsi::PropNameID::forAscii(runtime, "executor"),
277
+ 2,
278
+ [this, model_path, n_ctx, n_batch, n_ubatch, n_keep, use_mmap, use_mlock, use_jinja, embedding, n_threads, n_gpu_layers, logits_file, rope_freq_base, rope_freq_scale, seed, verbosity, yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow, chat_template, lora_adapters](jsi::Runtime& runtime, const jsi::Value& thisValue, const jsi::Value* args, size_t count) -> jsi::Value {
242
279
 
243
- // Check if initialization was successful
244
- if (!result.model || !result.context) {
245
- throw std::runtime_error("Failed to initialize model and context");
246
- }
247
- } catch (const std::exception& e) {
248
- // If we were trying to use GPU and got a Vulkan/shader error, retry with CPU-only
249
- if (params.n_gpu_layers > 0) {
250
- // Other GPU error, still try CPU fallback
251
- fprintf(stderr, "GPU initialization failed (%s), retrying with CPU-only\n", e.what());
252
-
253
- params.n_gpu_layers = 0;
254
-
280
+ auto resolve = std::make_shared<jsi::Function>(args[0].asObject(runtime).asFunction(runtime));
281
+ auto reject = std::make_shared<jsi::Function>(args[1].asObject(runtime).asFunction(runtime));
282
+
283
+ // Capture a raw pointer to the runtime, a copy of the invoker, and a shared_ptr to this object for use by the background thread
284
+ auto runtimePtr = &runtime;
285
+ auto invoker = jsInvoker_;
286
+ auto selfPtr = shared_from_this();
287
+
288
+ // Launch background thread for model initialization
289
+ std::thread([selfPtr, model_path, n_ctx, n_batch, n_ubatch, n_keep, use_mmap, use_mlock, use_jinja, embedding, n_threads, n_gpu_layers, logits_file, rope_freq_base, rope_freq_scale, seed, verbosity, yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow, chat_template, lora_adapters, resolve, reject, runtimePtr, invoker]() {
255
290
  try {
256
- result = common_init_from_params(params);
291
+ // Thread-safe access to member variables
292
+ std::lock_guard<std::mutex> lock(selfPtr->mutex_);
293
+
294
+ // Initialize llama backend
295
+ llama_backend_init();
296
+
297
+ // Initialize params with defaults
298
+ rn_common_params params;
299
+
300
+ // Set default sampling parameters
301
+ params.sampling = common_params_sampling();
302
+
303
+ // Set all parsed native values
304
+ params.model.path = model_path;
305
+ params.n_ctx = n_ctx;
306
+ params.n_batch = n_batch;
307
+ params.n_ubatch = n_ubatch;
308
+ params.n_keep = n_keep;
309
+ params.use_mmap = use_mmap;
310
+ params.use_mlock = use_mlock;
311
+ params.use_jinja = use_jinja;
312
+ params.embedding = embedding;
313
+ params.cpuparams.n_threads = n_threads;
314
+ params.n_gpu_layers = n_gpu_layers;
315
+ params.logits_file = logits_file;
316
+ params.rope_freq_base = rope_freq_base;
317
+ params.rope_freq_scale = rope_freq_scale;
318
+ params.sampling.seed = seed;
319
+ params.verbosity = verbosity;
320
+ params.yarn_ext_factor = yarn_ext_factor;
321
+ params.yarn_attn_factor = yarn_attn_factor;
322
+ params.yarn_beta_fast = yarn_beta_fast;
323
+ params.yarn_beta_slow = yarn_beta_slow;
324
+
325
+ if (!chat_template.empty()) {
326
+ params.chat_template = chat_template;
327
+ }
328
+
329
+ // Add LoRA adapters
330
+ for (const auto& lora : lora_adapters) {
331
+ common_adapter_lora_info lora_info;
332
+ lora_info.path = lora.first;
333
+ lora_info.scale = lora.second;
334
+ params.lora_adapters.push_back(lora_info);
335
+ }
336
+
337
+ // Initialize using common_init_from_params
338
+ common_init_result result;
257
339
 
258
- if (!result.model || !result.context) {
259
- throw std::runtime_error("Failed to initialize model and context even with CPU-only mode");
340
+ try {
341
+ result = common_init_from_params(params);
342
+
343
+ // Check if initialization was successful
344
+ if (!result.model || !result.context) {
345
+ throw std::runtime_error("Failed to initialize model and context");
346
+ }
347
+ } catch (const std::exception& e) {
348
+ // If we were trying to use GPU and got an error, retry with CPU-only
349
+ if (params.n_gpu_layers > 0) {
350
+ fprintf(stderr, "GPU initialization failed (%s), retrying with CPU-only\n", e.what());
351
+
352
+ params.n_gpu_layers = 0;
353
+
354
+ try {
355
+ result = common_init_from_params(params);
356
+
357
+ if (!result.model || !result.context) {
358
+ throw std::runtime_error("Failed to initialize model and context even with CPU-only mode");
359
+ }
360
+
361
+ fprintf(stderr, "Successfully recovered with CPU-only mode after GPU failure\n");
362
+ } catch (const std::exception& cpu_e) {
363
+ throw std::runtime_error(std::string("Model initialization failed: ") + cpu_e.what());
364
+ }
365
+ } else {
366
+ // Was already CPU-only, re-throw the original error
367
+ throw std::runtime_error(std::string("Model initialization failed: ") + e.what());
368
+ }
369
+ }
370
+
371
+ // Create and initialize rn_llama_context
372
+ selfPtr->rn_ctx_ = std::make_unique<facebook::react::rn_llama_context>();
373
+ selfPtr->rn_ctx_->model = result.model.release();
374
+ selfPtr->rn_ctx_->ctx = result.context.release();
375
+ selfPtr->rn_ctx_->model_loaded = true;
376
+ selfPtr->rn_ctx_->vocab = llama_model_get_vocab(selfPtr->rn_ctx_->model);
377
+
378
+ // Create a rn_common_params from the common_params
379
+ rn_common_params rn_params;
380
+ // Copy the base class fields
381
+ static_cast<common_params&>(rn_params) = params;
382
+ // Set additional fields
383
+ rn_params.use_jinja = params.use_jinja;
384
+ rn_params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
385
+ // Now assign to the context
386
+ selfPtr->rn_ctx_->params = rn_params;
387
+
388
+ selfPtr->rn_ctx_->chat_templates = common_chat_templates_init(selfPtr->rn_ctx_->model, params.chat_template);
389
+ try {
390
+ common_chat_format_example(selfPtr->rn_ctx_->chat_templates.get(), params.use_jinja);
391
+ } catch (const std::exception & e) {
392
+ // Fallback to chatml if the original template parsing fails
393
+ selfPtr->rn_ctx_->chat_templates = common_chat_templates_init(selfPtr->rn_ctx_->model, "chatml");
260
394
  }
395
+
396
+ // Schedule success callback on JS thread to create JSI objects
397
+ invoker->invokeAsync([selfPtr, resolve, runtimePtr]() {
398
+ try {
399
+ // Create the model object and resolve Promise on JS thread
400
+ jsi::Object modelObject = selfPtr->createModelObject(*runtimePtr, selfPtr->rn_ctx_.get());
401
+ resolve->call(*runtimePtr, modelObject);
402
+ } catch (const std::exception& e) {
403
+ // If conversion fails, resolve (not reject) the Promise with an object whose "error" property holds the message
404
+ jsi::Object errorObj(*runtimePtr);
405
+ errorObj.setProperty(*runtimePtr, "error", jsi::String::createFromUtf8(*runtimePtr, e.what()));
406
+ resolve->call(*runtimePtr, errorObj);
407
+ }
408
+ });
261
409
 
262
- fprintf(stderr, "Successfully recovered with CPU-only mode after GPU failure\n");
263
- } catch (const std::exception& cpu_e) {
264
- throw std::runtime_error(std::string("Model initialization failed: ") + cpu_e.what());
410
+ } catch (const std::exception& e) {
411
+ // Schedule error callback on JS thread
412
+ std::string errorMsg(e.what());
413
+ fprintf(stderr, "initLlama error: %s\n", errorMsg.c_str());
414
+ invoker->invokeAsync([reject, errorMsg, runtimePtr]() {
415
+ try {
416
+ reject->call(*runtimePtr, jsi::String::createFromUtf8(*runtimePtr, errorMsg));
417
+ } catch (...) {
418
+ // Ignore rejection errors
419
+ }
420
+ });
265
421
  }
266
- } else {
267
- // Was already CPU-only, re-throw the original error
268
- throw std::runtime_error(std::string("Model initialization failed: ") + e.what());
269
- }
270
- }
271
-
272
- // Create and initialize rn_llama_context
273
- rn_ctx_ = std::make_unique<facebook::react::rn_llama_context>();
274
- rn_ctx_->model = result.model.release();
275
- rn_ctx_->ctx = result.context.release();
276
- rn_ctx_->model_loaded = true;
277
- rn_ctx_->vocab = llama_model_get_vocab(rn_ctx_->model);
278
-
279
- // Create a rn_common_params from the common_params
280
- rn_common_params rn_params;
281
- // Copy the base class fields
282
- static_cast<common_params&>(rn_params) = params;
283
- // Set additional fields
284
- rn_params.use_jinja = params.use_jinja;
285
- rn_params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
286
- // Don't force a specific chat format - let the template system auto-detect based on model and tools
287
- // rn_params.chat_format = COMMON_CHAT_FORMAT_GENERIC;
288
- // Now assign to the context
289
- rn_ctx_->params = rn_params;
290
-
291
- rn_ctx_->chat_templates = common_chat_templates_init(rn_ctx_->model, params.chat_template);
292
- try {
293
- common_chat_format_example(rn_ctx_->chat_templates.get(), params.use_jinja);
294
- } catch (const std::exception & e) {
295
- // Fallback to chatml if the original template parsing fails
296
- rn_ctx_->chat_templates = common_chat_templates_init(rn_ctx_->model, "chatml");
422
+ }).detach();
423
+
424
+ return jsi::Value::undefined();
297
425
  }
298
-
299
-
300
- // Create the model object and return it
301
- return createModelObject(runtime, rn_ctx_.get());
302
- } catch (const std::exception& e) {
303
- // We can keep this top-level error log as it's for initialization failure
304
- fprintf(stderr, "initLlama error: %s\n", e.what());
305
- throw jsi::JSError(runtime, e.what());
306
- }
426
+ );
427
+
428
+ return Promise.callAsConstructor(runtime, std::move(executor));
307
429
  }
308
430
 
309
431
  jsi::Object PureCppImpl::createModelObject(jsi::Runtime& runtime, rn_llama_context* rn_ctx) {
package/cpp/PureCppImpl.h CHANGED
@@ -28,7 +28,7 @@ class LlamaCppModel; // Forward declare LlamaCppModel
28
28
  namespace facebook::react {
29
29
 
30
30
  // Note: The class name is PureCppImpl, and it derives from your project's C++ spec
31
- class PureCppImpl : public NativeRNLlamaCppCxxSpec<PureCppImpl> {
31
+ class PureCppImpl : public NativeRNLlamaCppCxxSpec<PureCppImpl>, public std::enable_shared_from_this<PureCppImpl> {
32
32
  public:
33
33
  // Constructor
34
34
  PureCppImpl(std::shared_ptr<CallInvoker> jsInvoker);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@novastera-oss/llamarn",
3
- "version": "0.2.3",
3
+ "version": "0.2.4",
4
4
  "description": "An attempt at a pure cpp turbo module library",
5
5
  "source": "./src/index.tsx",
6
6
  "main": "./lib/module/index.js",
@@ -234,7 +234,7 @@ export interface Spec extends TurboModule {
234
234
  // Initialize a Llama context with the given model parameters
235
235
  initLlama(params: LlamaModelParams): Promise<LlamaContextType & LlamaContextMethods>;
236
236
 
237
- // Load model info without creating a full contex
237
+ // Load model info without creating a full context
238
238
  loadLlamaModelInfo(modelPath: string): Promise<{
239
239
  n_params: number;
240
240
  n_vocab: number;