@novastera-oss/llamarn 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cpp/PureCppImpl.cpp +351 -247
- package/cpp/PureCppImpl.h +1 -1
- package/cpp/SystemUtils.h +2 -2
- package/cpp/rn-completion.cpp +56 -3
- package/package.json +1 -1
- package/src/NativeRNLlamaCpp.ts +1 -1
package/cpp/PureCppImpl.cpp
CHANGED
|
@@ -43,285 +43,389 @@ double PureCppImpl::multiply(jsi::Runtime& rt, double a, double b) {
|
|
|
43
43
|
}
|
|
44
44
|
|
|
45
45
|
jsi::Value PureCppImpl::loadLlamaModelInfo(jsi::Runtime &runtime, jsi::String modelPath) {
|
|
46
|
+
// Parse JSI arguments to native types on JSI thread
|
|
46
47
|
std::string path = modelPath.utf8(runtime);
|
|
47
48
|
SystemUtils::normalizeFilePath(path);
|
|
48
49
|
|
|
49
|
-
|
|
50
|
-
//
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
// Create model params
|
|
54
|
-
llama_model_params params = llama_model_default_params();
|
|
55
|
-
params.n_gpu_layers = 0; // Use CPU for model info loading
|
|
56
|
-
|
|
57
|
-
// Load the model
|
|
58
|
-
llama_model* model = llama_model_load_from_file(path.c_str(), params);
|
|
59
|
-
|
|
60
|
-
if (!model) {
|
|
61
|
-
throw std::runtime_error("Failed to load model from file: " + path);
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
// Create result object
|
|
65
|
-
jsi::Object result(runtime);
|
|
50
|
+
if (!jsInvoker_) {
|
|
51
|
+
// No synchronous fallback exists: without a CallInvoker the async path cannot run, so throw (this should not happen normally)
|
|
52
|
+
throw jsi::JSError(runtime, "CallInvoker not available for async operation");
|
|
53
|
+
}
|
|
66
54
|
|
|
67
|
-
|
|
68
|
-
|
|
55
|
+
// Create Promise constructor
|
|
56
|
+
auto Promise = runtime.global().getPropertyAsFunction(runtime, "Promise");
|
|
57
|
+
|
|
58
|
+
auto executor = jsi::Function::createFromHostFunction(
|
|
59
|
+
runtime,
|
|
60
|
+
jsi::PropNameID::forAscii(runtime, "executor"),
|
|
61
|
+
2,
|
|
62
|
+
[this, path](jsi::Runtime& runtime, const jsi::Value& thisValue, const jsi::Value* args, size_t count) -> jsi::Value {
|
|
63
|
+
|
|
64
|
+
auto resolve = std::make_shared<jsi::Function>(args[0].asObject(runtime).asFunction(runtime));
|
|
65
|
+
auto reject = std::make_shared<jsi::Function>(args[1].asObject(runtime).asFunction(runtime));
|
|
66
|
+
|
|
67
|
+
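// Capture the runtime by raw pointer (dereferenced only inside invokeAsync callbacks), plus the invoker and a shared_ptr to this object so it outlives the detached thread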
|
|
68
|
+
auto runtimePtr = &runtime;
|
|
69
|
+
auto invoker = jsInvoker_;
|
|
70
|
+
auto selfPtr = shared_from_this();
|
|
71
|
+
|
|
72
|
+
// Launch background thread for model info loading
|
|
73
|
+
std::thread([selfPtr, path, resolve, reject, runtimePtr, invoker]() {
|
|
74
|
+
try {
|
|
75
|
+
// Initialize llama backend
|
|
76
|
+
llama_backend_init();
|
|
69
77
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
78
|
+
// Create model params
|
|
79
|
+
llama_model_params params = llama_model_default_params();
|
|
80
|
+
params.n_gpu_layers = 0; // Use CPU for model info loading
|
|
73
81
|
|
|
74
|
-
|
|
75
|
-
|
|
82
|
+
// Load the model
|
|
83
|
+
llama_model* model = llama_model_load_from_file(path.c_str(), params);
|
|
76
84
|
|
|
77
|
-
|
|
78
|
-
|
|
85
|
+
if (!model) {
|
|
86
|
+
throw std::runtime_error("Failed to load model from file: " + path);
|
|
87
|
+
}
|
|
79
88
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
89
|
+
// Get model information (native types)
|
|
90
|
+
double n_params = (double)llama_model_n_params(model);
|
|
91
|
+
const llama_vocab* vocab = llama_model_get_vocab(model);
|
|
92
|
+
double n_vocab = (double)llama_vocab_n_tokens(vocab);
|
|
93
|
+
double n_context = (double)llama_model_n_ctx_train(model);
|
|
94
|
+
double n_embd = (double)llama_model_n_embd(model);
|
|
95
|
+
|
|
96
|
+
// Get model description
|
|
97
|
+
char buf[512];
|
|
98
|
+
llama_model_desc(model, buf, sizeof(buf));
|
|
99
|
+
std::string description = buf[0] ? buf : "Unknown model";
|
|
100
|
+
|
|
101
|
+
// Check if GPU is supported
|
|
102
|
+
bool gpuSupported = llama_supports_gpu_offload();
|
|
103
|
+
|
|
104
|
+
// Calculate optimal GPU layers if GPU is supported
|
|
105
|
+
int optimalGpuLayers = 0;
|
|
106
|
+
if (gpuSupported) {
|
|
107
|
+
optimalGpuLayers = SystemUtils::getOptimalGpuLayers(model);
|
|
108
|
+
}
|
|
85
109
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
110
|
+
// Extract quantization type from model description
|
|
111
|
+
std::string desc(buf);
|
|
112
|
+
std::string quantType = "Unknown";
|
|
113
|
+
size_t qPos = desc.find(" Q");
|
|
114
|
+
if (qPos != std::string::npos && qPos + 5 <= desc.length()) {
|
|
115
|
+
// Extract quantization string (like Q4_K, Q5_K, Q8_0)
|
|
116
|
+
quantType = desc.substr(qPos + 1, 4);
|
|
117
|
+
// Remove any trailing non-alphanumeric characters
|
|
118
|
+
quantType.erase(std::find_if(quantType.rbegin(), quantType.rend(), [](char c) {
|
|
119
|
+
return std::isalnum(c);
|
|
120
|
+
}).base(), quantType.end());
|
|
121
|
+
}
|
|
89
122
|
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
123
|
+
// Free the model
|
|
124
|
+
llama_model_free(model);
|
|
125
|
+
|
|
126
|
+
// Schedule success callback on JS thread to create JSI objects
|
|
127
|
+
invoker->invokeAsync([selfPtr, resolve, n_params, n_vocab, n_context, n_embd, description, gpuSupported, optimalGpuLayers, quantType, runtimePtr]() {
|
|
128
|
+
try {
|
|
129
|
+
// Create result object on JS thread
|
|
130
|
+
jsi::Object result(*runtimePtr);
|
|
131
|
+
result.setProperty(*runtimePtr, "n_params", jsi::Value(n_params));
|
|
132
|
+
result.setProperty(*runtimePtr, "n_vocab", jsi::Value(n_vocab));
|
|
133
|
+
result.setProperty(*runtimePtr, "n_context", jsi::Value(n_context));
|
|
134
|
+
result.setProperty(*runtimePtr, "n_embd", jsi::Value(n_embd));
|
|
135
|
+
result.setProperty(*runtimePtr, "description", jsi::String::createFromUtf8(*runtimePtr, description));
|
|
136
|
+
result.setProperty(*runtimePtr, "gpuSupported", jsi::Value(gpuSupported));
|
|
137
|
+
result.setProperty(*runtimePtr, "optimalGpuLayers", jsi::Value(optimalGpuLayers));
|
|
138
|
+
result.setProperty(*runtimePtr, "quant_type", jsi::String::createFromUtf8(*runtimePtr, quantType));
|
|
139
|
+
result.setProperty(*runtimePtr, "architecture", jsi::String::createFromUtf8(*runtimePtr, "Unknown"));
|
|
140
|
+
|
|
141
|
+
resolve->call(*runtimePtr, result);
|
|
142
|
+
} catch (const std::exception& e) {
|
|
143
|
+
// If conversion fails, create a simple error response
|
|
144
|
+
jsi::Object errorObj(*runtimePtr);
|
|
145
|
+
errorObj.setProperty(*runtimePtr, "error", jsi::String::createFromUtf8(*runtimePtr, e.what()));
|
|
146
|
+
resolve->call(*runtimePtr, errorObj);
|
|
147
|
+
}
|
|
148
|
+
});
|
|
149
|
+
|
|
150
|
+
} catch (const std::exception& e) {
|
|
151
|
+
// Schedule error callback on JS thread
|
|
152
|
+
std::string errorMsg(e.what());
|
|
153
|
+
invoker->invokeAsync([reject, errorMsg, runtimePtr]() {
|
|
154
|
+
try {
|
|
155
|
+
reject->call(*runtimePtr, jsi::String::createFromUtf8(*runtimePtr, errorMsg));
|
|
156
|
+
} catch (...) {
|
|
157
|
+
// Ignore rejection errors
|
|
158
|
+
}
|
|
159
|
+
});
|
|
160
|
+
}
|
|
161
|
+
}).detach();
|
|
162
|
+
|
|
163
|
+
return jsi::Value::undefined();
|
|
108
164
|
}
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
result.setProperty(runtime, "architecture",
|
|
113
|
-
jsi::String::createFromUtf8(runtime, "Unknown"));
|
|
114
|
-
|
|
115
|
-
// Free the model
|
|
116
|
-
llama_model_free(model);
|
|
117
|
-
|
|
118
|
-
return result;
|
|
119
|
-
} catch (const std::exception& e) {
|
|
120
|
-
jsi::Object error(runtime);
|
|
121
|
-
error.setProperty(runtime, "message", jsi::String::createFromUtf8(runtime, e.what()));
|
|
122
|
-
throw jsi::JSError(runtime, error.getProperty(runtime, "message").asString(runtime));
|
|
123
|
-
}
|
|
165
|
+
);
|
|
166
|
+
|
|
167
|
+
return Promise.callAsConstructor(runtime, std::move(executor));
|
|
124
168
|
}
|
|
125
169
|
|
|
126
170
|
jsi::Value PureCppImpl::initLlama(jsi::Runtime &runtime, jsi::Object options) {
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
if (!options.hasProperty(runtime, "model")) {
|
|
132
|
-
throw std::runtime_error("model path is required");
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
// Initialize llama backend
|
|
136
|
-
llama_backend_init();
|
|
137
|
-
|
|
138
|
-
std::string model_path = options.getProperty(runtime, "model").asString(runtime).utf8(runtime);
|
|
139
|
-
SystemUtils::normalizeFilePath(model_path);
|
|
140
|
-
|
|
141
|
-
// Initialize params with defaults
|
|
142
|
-
rn_common_params params;
|
|
143
|
-
|
|
144
|
-
// Set default sampling parameters
|
|
145
|
-
params.sampling = common_params_sampling();
|
|
146
|
-
|
|
147
|
-
// Set model path
|
|
148
|
-
params.model.path = model_path;
|
|
149
|
-
|
|
150
|
-
// Override defaults with user settings if provided
|
|
151
|
-
SystemUtils::setIfExists(runtime, options, "n_ctx", params.n_ctx);
|
|
152
|
-
SystemUtils::setIfExists(runtime, options, "n_batch", params.n_batch);
|
|
153
|
-
SystemUtils::setIfExists(runtime, options, "n_ubatch", params.n_ubatch);
|
|
154
|
-
SystemUtils::setIfExists(runtime, options, "n_keep", params.n_keep);
|
|
155
|
-
|
|
156
|
-
// Memory and resource options - MUST respect user settings
|
|
157
|
-
SystemUtils::setIfExists(runtime, options, "use_mmap", params.use_mmap);
|
|
158
|
-
SystemUtils::setIfExists(runtime, options, "use_mlock", params.use_mlock);
|
|
159
|
-
SystemUtils::setIfExists(runtime, options, "use_jinja", params.use_jinja);
|
|
160
|
-
|
|
161
|
-
// Extract threading parameters (preserve custom thread logic)
|
|
162
|
-
int n_threads = 0; // 0 = auto
|
|
163
|
-
if (options.hasProperty(runtime, "n_threads")) {
|
|
164
|
-
n_threads = options.getProperty(runtime, "n_threads").asNumber();
|
|
165
|
-
} else {
|
|
166
|
-
n_threads = SystemUtils::getOptimalThreadCount();
|
|
167
|
-
}
|
|
168
|
-
params.cpuparams.n_threads = n_threads;
|
|
169
|
-
|
|
170
|
-
// Set n_gpu_layers (preserve custom GPU logic)
|
|
171
|
-
int n_gpu_layers = 0;
|
|
172
|
-
bool gpuSupported = llama_supports_gpu_offload();
|
|
173
|
-
if (options.hasProperty(runtime, "n_gpu_layers") && gpuSupported) {
|
|
174
|
-
n_gpu_layers = options.getProperty(runtime, "n_gpu_layers").asNumber();
|
|
175
|
-
}
|
|
176
|
-
params.n_gpu_layers = n_gpu_layers;
|
|
177
|
-
|
|
178
|
-
// Additional model parameters
|
|
179
|
-
SystemUtils::setIfExists(runtime, options, "logits_file", params.logits_file);
|
|
180
|
-
SystemUtils::setIfExists(runtime, options, "embedding", params.embedding);
|
|
181
|
-
SystemUtils::setIfExists(runtime, options, "rope_freq_base", params.rope_freq_base);
|
|
182
|
-
SystemUtils::setIfExists(runtime, options, "rope_freq_scale", params.rope_freq_scale);
|
|
183
|
-
|
|
184
|
-
// Sampling parameters
|
|
185
|
-
SystemUtils::setIfExists(runtime, options, "seed", params.sampling.seed);
|
|
171
|
+
// Parse JSI arguments to native types on JSI thread
|
|
172
|
+
if (!options.hasProperty(runtime, "model")) {
|
|
173
|
+
throw jsi::JSError(runtime, "model path is required");
|
|
174
|
+
}
|
|
186
175
|
|
|
187
|
-
|
|
188
|
-
|
|
176
|
+
if (!jsInvoker_) {
|
|
177
|
+
// No synchronous fallback exists: without a CallInvoker the async path cannot run, so throw (this should not happen normally)
|
|
178
|
+
throw jsi::JSError(runtime, "CallInvoker not available for async operation");
|
|
179
|
+
}
|
|
189
180
|
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
181
|
+
// Parse all options to native types on JSI thread
|
|
182
|
+
std::string model_path = options.getProperty(runtime, "model").asString(runtime).utf8(runtime);
|
|
183
|
+
SystemUtils::normalizeFilePath(model_path);
|
|
184
|
+
|
|
185
|
+
// Parse all numeric/boolean options to native types
|
|
186
|
+
int n_ctx = 2048; // defaults
|
|
187
|
+
int n_batch = 512;
|
|
188
|
+
int n_ubatch = 512;
|
|
189
|
+
int n_keep = 0;
|
|
190
|
+
bool use_mmap = true;
|
|
191
|
+
bool use_mlock = false;
|
|
192
|
+
bool use_jinja = false;
|
|
193
|
+
bool embedding = false;
|
|
194
|
+
int n_threads = 0;
|
|
195
|
+
int n_gpu_layers = 0;
|
|
196
|
+
std::string logits_file;
|
|
197
|
+
float rope_freq_base = 10000.0f;
|
|
198
|
+
float rope_freq_scale = 1.0f;
|
|
199
|
+
uint32_t seed = 4294967295U; // default seed
|
|
200
|
+
int verbosity = 0;
|
|
201
|
+
float yarn_ext_factor = 1.0f;
|
|
202
|
+
float yarn_attn_factor = 1.0f;
|
|
203
|
+
float yarn_beta_fast = 32.0f;
|
|
204
|
+
float yarn_beta_slow = 1.0f;
|
|
205
|
+
std::string chat_template;
|
|
206
|
+
|
|
207
|
+
// Parse options to native types
|
|
208
|
+
SystemUtils::setIfExists(runtime, options, "n_ctx", n_ctx);
|
|
209
|
+
SystemUtils::setIfExists(runtime, options, "n_batch", n_batch);
|
|
210
|
+
SystemUtils::setIfExists(runtime, options, "n_ubatch", n_ubatch);
|
|
211
|
+
SystemUtils::setIfExists(runtime, options, "n_keep", n_keep);
|
|
212
|
+
SystemUtils::setIfExists(runtime, options, "use_mmap", use_mmap);
|
|
213
|
+
SystemUtils::setIfExists(runtime, options, "use_mlock", use_mlock);
|
|
214
|
+
SystemUtils::setIfExists(runtime, options, "use_jinja", use_jinja);
|
|
215
|
+
SystemUtils::setIfExists(runtime, options, "embedding", embedding);
|
|
216
|
+
SystemUtils::setIfExists(runtime, options, "rope_freq_base", rope_freq_base);
|
|
217
|
+
SystemUtils::setIfExists(runtime, options, "rope_freq_scale", rope_freq_scale);
|
|
218
|
+
SystemUtils::setIfExists(runtime, options, "seed", seed);
|
|
219
|
+
SystemUtils::setIfExists(runtime, options, "verbose", verbosity);
|
|
220
|
+
SystemUtils::setIfExists(runtime, options, "logits_file", logits_file);
|
|
221
|
+
SystemUtils::setIfExists(runtime, options, "chat_template", chat_template);
|
|
222
|
+
|
|
223
|
+
if (options.hasProperty(runtime, "n_threads")) {
|
|
224
|
+
n_threads = options.getProperty(runtime, "n_threads").asNumber();
|
|
225
|
+
} else {
|
|
226
|
+
n_threads = SystemUtils::getOptimalThreadCount();
|
|
227
|
+
}
|
|
203
228
|
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
}
|
|
229
|
+
bool gpuSupported = llama_supports_gpu_offload();
|
|
230
|
+
if (options.hasProperty(runtime, "n_gpu_layers") && gpuSupported) {
|
|
231
|
+
n_gpu_layers = options.getProperty(runtime, "n_gpu_layers").asNumber();
|
|
232
|
+
}
|
|
209
233
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
lora.path = adapter.getProperty(runtime, "path").asString(runtime).utf8(runtime);
|
|
223
|
-
|
|
224
|
-
// Get scale if provided
|
|
225
|
-
lora.scale = 1.0f; // Default scale
|
|
226
|
-
if (adapter.hasProperty(runtime, "scale") && adapter.getProperty(runtime, "scale").isNumber()) {
|
|
227
|
-
lora.scale = adapter.getProperty(runtime, "scale").asNumber();
|
|
228
|
-
}
|
|
234
|
+
if (options.hasProperty(runtime, "yarn_ext_factor")) {
|
|
235
|
+
yarn_ext_factor = options.getProperty(runtime, "yarn_ext_factor").asNumber();
|
|
236
|
+
}
|
|
237
|
+
if (options.hasProperty(runtime, "yarn_attn_factor")) {
|
|
238
|
+
yarn_attn_factor = options.getProperty(runtime, "yarn_attn_factor").asNumber();
|
|
239
|
+
}
|
|
240
|
+
if (options.hasProperty(runtime, "yarn_beta_fast")) {
|
|
241
|
+
yarn_beta_fast = options.getProperty(runtime, "yarn_beta_fast").asNumber();
|
|
242
|
+
}
|
|
243
|
+
if (options.hasProperty(runtime, "yarn_beta_slow")) {
|
|
244
|
+
yarn_beta_slow = options.getProperty(runtime, "yarn_beta_slow").asNumber();
|
|
245
|
+
}
|
|
229
246
|
|
|
230
|
-
|
|
247
|
+
// Parse LoRA adapters to native structure
|
|
248
|
+
std::vector<std::pair<std::string, float>> lora_adapters;
|
|
249
|
+
if (options.hasProperty(runtime, "lora_adapters") && options.getProperty(runtime, "lora_adapters").isObject()) {
|
|
250
|
+
jsi::Object lora_obj = options.getProperty(runtime, "lora_adapters").asObject(runtime);
|
|
251
|
+
if (lora_obj.isArray(runtime)) {
|
|
252
|
+
jsi::Array lora_array = lora_obj.asArray(runtime);
|
|
253
|
+
size_t n_lora = lora_array.size(runtime);
|
|
254
|
+
|
|
255
|
+
for (size_t i = 0; i < n_lora; i++) {
|
|
256
|
+
if (lora_array.getValueAtIndex(runtime, i).isObject()) {
|
|
257
|
+
jsi::Object adapter = lora_array.getValueAtIndex(runtime, i).asObject(runtime);
|
|
258
|
+
if (adapter.hasProperty(runtime, "path") && adapter.getProperty(runtime, "path").isString()) {
|
|
259
|
+
std::string lora_path = adapter.getProperty(runtime, "path").asString(runtime).utf8(runtime);
|
|
260
|
+
float lora_scale = 1.0f; // Default scale
|
|
261
|
+
if (adapter.hasProperty(runtime, "scale") && adapter.getProperty(runtime, "scale").isNumber()) {
|
|
262
|
+
lora_scale = adapter.getProperty(runtime, "scale").asNumber();
|
|
231
263
|
}
|
|
264
|
+
lora_adapters.emplace_back(lora_path, lora_scale);
|
|
232
265
|
}
|
|
233
266
|
}
|
|
234
267
|
}
|
|
235
268
|
}
|
|
269
|
+
}
|
|
236
270
|
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
271
|
+
// Create Promise constructor
|
|
272
|
+
auto Promise = runtime.global().getPropertyAsFunction(runtime, "Promise");
|
|
273
|
+
|
|
274
|
+
auto executor = jsi::Function::createFromHostFunction(
|
|
275
|
+
runtime,
|
|
276
|
+
jsi::PropNameID::forAscii(runtime, "executor"),
|
|
277
|
+
2,
|
|
278
|
+
[this, model_path, n_ctx, n_batch, n_ubatch, n_keep, use_mmap, use_mlock, use_jinja, embedding, n_threads, n_gpu_layers, logits_file, rope_freq_base, rope_freq_scale, seed, verbosity, yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow, chat_template, lora_adapters](jsi::Runtime& runtime, const jsi::Value& thisValue, const jsi::Value* args, size_t count) -> jsi::Value {
|
|
242
279
|
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
params.n_gpu_layers = 0;
|
|
254
|
-
|
|
280
|
+
auto resolve = std::make_shared<jsi::Function>(args[0].asObject(runtime).asFunction(runtime));
|
|
281
|
+
auto reject = std::make_shared<jsi::Function>(args[1].asObject(runtime).asFunction(runtime));
|
|
282
|
+
|
|
283
|
+
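// Capture the runtime by raw pointer (dereferenced only inside invokeAsync callbacks), plus the invoker and a shared_ptr to this object so it outlives the detached thread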
|
|
284
|
+
auto runtimePtr = &runtime;
|
|
285
|
+
auto invoker = jsInvoker_;
|
|
286
|
+
auto selfPtr = shared_from_this();
|
|
287
|
+
|
|
288
|
+
// Launch background thread for model initialization
|
|
289
|
+
std::thread([selfPtr, model_path, n_ctx, n_batch, n_ubatch, n_keep, use_mmap, use_mlock, use_jinja, embedding, n_threads, n_gpu_layers, logits_file, rope_freq_base, rope_freq_scale, seed, verbosity, yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow, chat_template, lora_adapters, resolve, reject, runtimePtr, invoker]() {
|
|
255
290
|
try {
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
291
|
+
// Thread-safe access to member variables
|
|
292
|
+
std::lock_guard<std::mutex> lock(selfPtr->mutex_);
|
|
293
|
+
|
|
294
|
+
// Initialize llama backend
|
|
295
|
+
llama_backend_init();
|
|
296
|
+
|
|
297
|
+
// Initialize params with defaults
|
|
298
|
+
rn_common_params params;
|
|
299
|
+
|
|
300
|
+
// Set default sampling parameters
|
|
301
|
+
params.sampling = common_params_sampling();
|
|
302
|
+
|
|
303
|
+
// Set all parsed native values
|
|
304
|
+
params.model.path = model_path;
|
|
305
|
+
params.n_ctx = n_ctx;
|
|
306
|
+
params.n_batch = n_batch;
|
|
307
|
+
params.n_ubatch = n_ubatch;
|
|
308
|
+
params.n_keep = n_keep;
|
|
309
|
+
params.use_mmap = use_mmap;
|
|
310
|
+
params.use_mlock = use_mlock;
|
|
311
|
+
params.use_jinja = use_jinja;
|
|
312
|
+
params.embedding = embedding;
|
|
313
|
+
params.cpuparams.n_threads = n_threads;
|
|
314
|
+
params.n_gpu_layers = n_gpu_layers;
|
|
315
|
+
params.logits_file = logits_file;
|
|
316
|
+
params.rope_freq_base = rope_freq_base;
|
|
317
|
+
params.rope_freq_scale = rope_freq_scale;
|
|
318
|
+
params.sampling.seed = seed;
|
|
319
|
+
params.verbosity = verbosity;
|
|
320
|
+
params.yarn_ext_factor = yarn_ext_factor;
|
|
321
|
+
params.yarn_attn_factor = yarn_attn_factor;
|
|
322
|
+
params.yarn_beta_fast = yarn_beta_fast;
|
|
323
|
+
params.yarn_beta_slow = yarn_beta_slow;
|
|
324
|
+
|
|
325
|
+
if (!chat_template.empty()) {
|
|
326
|
+
params.chat_template = chat_template;
|
|
260
327
|
}
|
|
328
|
+
|
|
329
|
+
// Add LoRA adapters
|
|
330
|
+
for (const auto& lora : lora_adapters) {
|
|
331
|
+
common_adapter_lora_info lora_info;
|
|
332
|
+
lora_info.path = lora.first;
|
|
333
|
+
lora_info.scale = lora.second;
|
|
334
|
+
params.lora_adapters.push_back(lora_info);
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
// Initialize using common_init_from_params
|
|
338
|
+
common_init_result result;
|
|
261
339
|
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
340
|
+
try {
|
|
341
|
+
result = common_init_from_params(params);
|
|
342
|
+
|
|
343
|
+
// Check if initialization was successful
|
|
344
|
+
if (!result.model || !result.context) {
|
|
345
|
+
throw std::runtime_error("Failed to initialize model and context");
|
|
346
|
+
}
|
|
347
|
+
} catch (const std::exception& e) {
|
|
348
|
+
// If we were trying to use GPU and got an error, retry with CPU-only
|
|
349
|
+
if (params.n_gpu_layers > 0) {
|
|
350
|
+
fprintf(stderr, "GPU initialization failed (%s), retrying with CPU-only\n", e.what());
|
|
351
|
+
|
|
352
|
+
params.n_gpu_layers = 0;
|
|
353
|
+
|
|
354
|
+
try {
|
|
355
|
+
result = common_init_from_params(params);
|
|
356
|
+
|
|
357
|
+
if (!result.model || !result.context) {
|
|
358
|
+
throw std::runtime_error("Failed to initialize model and context even with CPU-only mode");
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
fprintf(stderr, "Successfully recovered with CPU-only mode after GPU failure\n");
|
|
362
|
+
} catch (const std::exception& cpu_e) {
|
|
363
|
+
throw std::runtime_error(std::string("Model initialization failed: ") + cpu_e.what());
|
|
364
|
+
}
|
|
365
|
+
} else {
|
|
366
|
+
// Was already CPU-only, re-throw the original error
|
|
367
|
+
throw std::runtime_error(std::string("Model initialization failed: ") + e.what());
|
|
368
|
+
}
|
|
369
|
+
}
|
|
271
370
|
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
371
|
+
// Create and initialize rn_llama_context
|
|
372
|
+
selfPtr->rn_ctx_ = std::make_unique<facebook::react::rn_llama_context>();
|
|
373
|
+
selfPtr->rn_ctx_->model = result.model.release();
|
|
374
|
+
selfPtr->rn_ctx_->ctx = result.context.release();
|
|
375
|
+
selfPtr->rn_ctx_->model_loaded = true;
|
|
376
|
+
selfPtr->rn_ctx_->vocab = llama_model_get_vocab(selfPtr->rn_ctx_->model);
|
|
377
|
+
|
|
378
|
+
// Create a rn_common_params from the common_params
|
|
379
|
+
rn_common_params rn_params;
|
|
380
|
+
// Copy the base class fields
|
|
381
|
+
static_cast<common_params&>(rn_params) = params;
|
|
382
|
+
// Set additional fields
|
|
383
|
+
rn_params.use_jinja = params.use_jinja;
|
|
384
|
+
rn_params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
|
385
|
+
// Now assign to the context
|
|
386
|
+
selfPtr->rn_ctx_->params = rn_params;
|
|
387
|
+
|
|
388
|
+
selfPtr->rn_ctx_->chat_templates = common_chat_templates_init(selfPtr->rn_ctx_->model, params.chat_template);
|
|
389
|
+
try {
|
|
390
|
+
common_chat_format_example(selfPtr->rn_ctx_->chat_templates.get(), params.use_jinja);
|
|
391
|
+
} catch (const std::exception & e) {
|
|
392
|
+
// Fallback to chatml if the original template parsing fails
|
|
393
|
+
selfPtr->rn_ctx_->chat_templates = common_chat_templates_init(selfPtr->rn_ctx_->model, "chatml");
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
// Schedule success callback on JS thread to create JSI objects
|
|
397
|
+
invoker->invokeAsync([selfPtr, resolve, runtimePtr]() {
|
|
398
|
+
try {
|
|
399
|
+
// Create the model object and resolve Promise on JS thread
|
|
400
|
+
jsi::Object modelObject = selfPtr->createModelObject(*runtimePtr, selfPtr->rn_ctx_.get());
|
|
401
|
+
resolve->call(*runtimePtr, modelObject);
|
|
402
|
+
} catch (const std::exception& e) {
|
|
403
|
+
// If conversion fails, create a simple error response
|
|
404
|
+
jsi::Object errorObj(*runtimePtr);
|
|
405
|
+
errorObj.setProperty(*runtimePtr, "error", jsi::String::createFromUtf8(*runtimePtr, e.what()));
|
|
406
|
+
resolve->call(*runtimePtr, errorObj);
|
|
407
|
+
}
|
|
408
|
+
});
|
|
409
|
+
|
|
410
|
+
} catch (const std::exception& e) {
|
|
411
|
+
// Schedule error callback on JS thread
|
|
412
|
+
std::string errorMsg(e.what());
|
|
413
|
+
fprintf(stderr, "initLlama error: %s\n", errorMsg.c_str());
|
|
414
|
+
invoker->invokeAsync([reject, errorMsg, runtimePtr]() {
|
|
415
|
+
try {
|
|
416
|
+
reject->call(*runtimePtr, jsi::String::createFromUtf8(*runtimePtr, errorMsg));
|
|
417
|
+
} catch (...) {
|
|
418
|
+
// Ignore rejection errors
|
|
419
|
+
}
|
|
420
|
+
});
|
|
316
421
|
}
|
|
422
|
+
}).detach();
|
|
423
|
+
|
|
424
|
+
return jsi::Value::undefined();
|
|
317
425
|
}
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
} catch (const std::exception& e) {
|
|
322
|
-
fprintf(stderr, "initLlama error: %s\n", e.what());
|
|
323
|
-
throw jsi::JSError(runtime, e.what());
|
|
324
|
-
}
|
|
426
|
+
);
|
|
427
|
+
|
|
428
|
+
return Promise.callAsConstructor(runtime, std::move(executor));
|
|
325
429
|
}
|
|
326
430
|
|
|
327
431
|
jsi::Object PureCppImpl::createModelObject(jsi::Runtime& runtime, rn_llama_context* rn_ctx) {
|
package/cpp/PureCppImpl.h
CHANGED
|
@@ -28,7 +28,7 @@ class LlamaCppModel; // Forward declare LlamaCppModel
|
|
|
28
28
|
namespace facebook::react {
|
|
29
29
|
|
|
30
30
|
// Note: The class name is PureCppImpl, and it derives from your project's C++ spec
|
|
31
|
-
class PureCppImpl : public NativeRNLlamaCppCxxSpec<PureCppImpl> {
|
|
31
|
+
class PureCppImpl : public NativeRNLlamaCppCxxSpec<PureCppImpl>, public std::enable_shared_from_this<PureCppImpl> {
|
|
32
32
|
public:
|
|
33
33
|
// Constructor
|
|
34
34
|
PureCppImpl(std::shared_ptr<CallInvoker> jsInvoker);
|
package/cpp/SystemUtils.h
CHANGED
|
@@ -44,8 +44,8 @@ public:
|
|
|
44
44
|
* Helper functions to easily set values from a JSI object if the property exists.
|
|
45
45
|
* Returns true if the property was found and the value was set.
|
|
46
46
|
*/
|
|
47
|
-
// Template for
|
|
48
|
-
template<typename T, typename = typename std::enable_if<std::is_arithmetic<T>::value>::type>
|
|
47
|
+
// Template for numeric types (excluding bool so bool specialization is used)
|
|
48
|
+
template<typename T, typename = typename std::enable_if<std::is_arithmetic<T>::value && !std::is_same<T, bool>::value>::type>
|
|
49
49
|
static bool setIfExists(jsi::Runtime& rt, const jsi::Object& options, const std::string& key, T& outValue) {
|
|
50
50
|
if (options.hasProperty(rt, key.c_str())) {
|
|
51
51
|
jsi::Value val = options.getProperty(rt, key.c_str());
|
package/cpp/rn-completion.cpp
CHANGED
|
@@ -350,7 +350,7 @@ CompletionResult run_chat_completion(
|
|
|
350
350
|
common_chat_templates_inputs template_inputs;
|
|
351
351
|
template_inputs.messages = chat_msgs;
|
|
352
352
|
template_inputs.add_generation_prompt = true;
|
|
353
|
-
template_inputs.use_jinja =
|
|
353
|
+
template_inputs.use_jinja = rn_ctx->params.use_jinja;
|
|
354
354
|
// Note: extract_reasoning field doesn't exist in current llama.cpp version
|
|
355
355
|
// template_inputs.extract_reasoning = true; // Default to true to extract reasoning content if available
|
|
356
356
|
|
|
@@ -391,6 +391,31 @@ CompletionResult run_chat_completion(
|
|
|
391
391
|
result = run_completion(rn_ctx, cmpl_options, callback);
|
|
392
392
|
|
|
393
393
|
if (result.success) {
|
|
394
|
+
// Parse the generated content for tool calls and structured responses
|
|
395
|
+
common_chat_msg parsed_msg;
|
|
396
|
+
bool has_parsed_content = false;
|
|
397
|
+
|
|
398
|
+
// Only parse if we have tools available and the response isn't empty
|
|
399
|
+
if (!template_inputs.tools.empty() && !result.content.empty()) {
|
|
400
|
+
try {
|
|
401
|
+
// Construct the chat syntax for parsing using the format from template application
|
|
402
|
+
common_chat_syntax syntax;
|
|
403
|
+
syntax.format = chat_params.format; // Use format from template, not from params
|
|
404
|
+
syntax.reasoning_format = rn_ctx->params.reasoning_format;
|
|
405
|
+
syntax.reasoning_in_content = true;
|
|
406
|
+
syntax.thinking_forced_open = false;
|
|
407
|
+
syntax.parse_tool_calls = true;
|
|
408
|
+
|
|
409
|
+
// Parse the generated content for tool calls
|
|
410
|
+
parsed_msg = common_chat_parse(result.content, false, syntax);
|
|
411
|
+
has_parsed_content = true;
|
|
412
|
+
|
|
413
|
+
} catch (const std::exception& e) {
|
|
414
|
+
// If parsing fails, treat as regular content
|
|
415
|
+
has_parsed_content = false;
|
|
416
|
+
}
|
|
417
|
+
}
|
|
418
|
+
|
|
394
419
|
// Create OpenAI-compatible response
|
|
395
420
|
json response = {
|
|
396
421
|
{"id", gen_chatcmplid()},
|
|
@@ -403,11 +428,39 @@ CompletionResult run_chat_completion(
|
|
|
403
428
|
json choice = {
|
|
404
429
|
{"index", 0},
|
|
405
430
|
{"message", {
|
|
406
|
-
{"role", "assistant"}
|
|
407
|
-
{"content", result.content}
|
|
431
|
+
{"role", "assistant"}
|
|
408
432
|
}},
|
|
409
433
|
{"finish_reason", "stop"}
|
|
410
434
|
};
|
|
435
|
+
|
|
436
|
+
// Add parsed content and tool calls if available
|
|
437
|
+
if (has_parsed_content && !parsed_msg.tool_calls.empty()) {
|
|
438
|
+
// Set content to the parsed content (may be null for tool-only responses)
|
|
439
|
+
if (!parsed_msg.content.empty()) {
|
|
440
|
+
choice["message"]["content"] = parsed_msg.content;
|
|
441
|
+
} else {
|
|
442
|
+
choice["message"]["content"] = nullptr;
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
// Add tool calls to the message
|
|
446
|
+
json tool_calls = json::array();
|
|
447
|
+
for (const auto& tool_call : parsed_msg.tool_calls) {
|
|
448
|
+
json tc = {
|
|
449
|
+
{"id", tool_call.id.empty() ? ("call_" + std::to_string(std::rand())) : tool_call.id},
|
|
450
|
+
{"type", "function"},
|
|
451
|
+
{"function", {
|
|
452
|
+
{"name", tool_call.name},
|
|
453
|
+
{"arguments", tool_call.arguments}
|
|
454
|
+
}}
|
|
455
|
+
};
|
|
456
|
+
tool_calls.push_back(tc);
|
|
457
|
+
}
|
|
458
|
+
choice["message"]["tool_calls"] = tool_calls;
|
|
459
|
+
choice["finish_reason"] = "tool_calls";
|
|
460
|
+
} else {
|
|
461
|
+
// Regular text response
|
|
462
|
+
choice["message"]["content"] = has_parsed_content ? parsed_msg.content : result.content;
|
|
463
|
+
}
|
|
411
464
|
|
|
412
465
|
choices.push_back(choice);
|
|
413
466
|
response["choices"] = choices;
|
package/package.json
CHANGED
package/src/NativeRNLlamaCpp.ts
CHANGED
|
@@ -234,7 +234,7 @@ export interface Spec extends TurboModule {
|
|
|
234
234
|
// Initialize a Llama context with the given model parameters
|
|
235
235
|
initLlama(params: LlamaModelParams): Promise<LlamaContextType & LlamaContextMethods>;
|
|
236
236
|
|
|
237
|
-
// Load model info without creating a full
|
|
237
|
+
// Load model info without creating a full context
|
|
238
238
|
loadLlamaModelInfo(modelPath: string): Promise<{
|
|
239
239
|
n_params: number;
|
|
240
240
|
n_vocab: number;
|