@novastera-oss/llamarn 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cpp/PureCppImpl.cpp +351 -229
- package/cpp/PureCppImpl.h +1 -1
- package/cpp/rn-completion.cpp +42 -48
- package/package.json +1 -1
- package/src/NativeRNLlamaCpp.ts +1 -1
package/cpp/PureCppImpl.cpp
CHANGED
|
@@ -43,267 +43,389 @@ double PureCppImpl::multiply(jsi::Runtime& rt, double a, double b) {
|
|
|
43
43
|
}
|
|
44
44
|
|
|
45
45
|
jsi::Value PureCppImpl::loadLlamaModelInfo(jsi::Runtime &runtime, jsi::String modelPath) {
|
|
46
|
+
// Parse JSI arguments to native types on JSI thread
|
|
46
47
|
std::string path = modelPath.utf8(runtime);
|
|
47
48
|
SystemUtils::normalizeFilePath(path);
|
|
48
49
|
|
|
49
|
-
|
|
50
|
-
//
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
// Create model params
|
|
54
|
-
llama_model_params params = llama_model_default_params();
|
|
55
|
-
params.n_gpu_layers = 0; // Use CPU for model info loading
|
|
56
|
-
|
|
57
|
-
// Load the model
|
|
58
|
-
llama_model* model = llama_model_load_from_file(path.c_str(), params);
|
|
59
|
-
|
|
60
|
-
if (!model) {
|
|
61
|
-
throw std::runtime_error("Failed to load model from file: " + path);
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
// Create result object
|
|
65
|
-
jsi::Object result(runtime);
|
|
50
|
+
if (!jsInvoker_) {
|
|
51
|
+
// Fallback to synchronous if no CallInvoker available - this should not happen normally
|
|
52
|
+
throw jsi::JSError(runtime, "CallInvoker not available for async operation");
|
|
53
|
+
}
|
|
66
54
|
|
|
67
|
-
|
|
68
|
-
|
|
55
|
+
// Create Promise constructor
|
|
56
|
+
auto Promise = runtime.global().getPropertyAsFunction(runtime, "Promise");
|
|
57
|
+
|
|
58
|
+
auto executor = jsi::Function::createFromHostFunction(
|
|
59
|
+
runtime,
|
|
60
|
+
jsi::PropNameID::forAscii(runtime, "executor"),
|
|
61
|
+
2,
|
|
62
|
+
[this, path](jsi::Runtime& runtime, const jsi::Value& thisValue, const jsi::Value* args, size_t count) -> jsi::Value {
|
|
63
|
+
|
|
64
|
+
auto resolve = std::make_shared<jsi::Function>(args[0].asObject(runtime).asFunction(runtime));
|
|
65
|
+
auto reject = std::make_shared<jsi::Function>(args[1].asObject(runtime).asFunction(runtime));
|
|
66
|
+
|
|
67
|
+
// Create shared references to runtime and invoker for thread safety
|
|
68
|
+
auto runtimePtr = &runtime;
|
|
69
|
+
auto invoker = jsInvoker_;
|
|
70
|
+
auto selfPtr = shared_from_this();
|
|
71
|
+
|
|
72
|
+
// Launch background thread for model info loading
|
|
73
|
+
std::thread([selfPtr, path, resolve, reject, runtimePtr, invoker]() {
|
|
74
|
+
try {
|
|
75
|
+
// Initialize llama backend
|
|
76
|
+
llama_backend_init();
|
|
69
77
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
78
|
+
// Create model params
|
|
79
|
+
llama_model_params params = llama_model_default_params();
|
|
80
|
+
params.n_gpu_layers = 0; // Use CPU for model info loading
|
|
73
81
|
|
|
74
|
-
|
|
75
|
-
|
|
82
|
+
// Load the model
|
|
83
|
+
llama_model* model = llama_model_load_from_file(path.c_str(), params);
|
|
76
84
|
|
|
77
|
-
|
|
78
|
-
|
|
85
|
+
if (!model) {
|
|
86
|
+
throw std::runtime_error("Failed to load model from file: " + path);
|
|
87
|
+
}
|
|
79
88
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
89
|
+
// Get model information (native types)
|
|
90
|
+
double n_params = (double)llama_model_n_params(model);
|
|
91
|
+
const llama_vocab* vocab = llama_model_get_vocab(model);
|
|
92
|
+
double n_vocab = (double)llama_vocab_n_tokens(vocab);
|
|
93
|
+
double n_context = (double)llama_model_n_ctx_train(model);
|
|
94
|
+
double n_embd = (double)llama_model_n_embd(model);
|
|
95
|
+
|
|
96
|
+
// Get model description
|
|
97
|
+
char buf[512];
|
|
98
|
+
llama_model_desc(model, buf, sizeof(buf));
|
|
99
|
+
std::string description = buf[0] ? buf : "Unknown model";
|
|
100
|
+
|
|
101
|
+
// Check if GPU is supported
|
|
102
|
+
bool gpuSupported = llama_supports_gpu_offload();
|
|
103
|
+
|
|
104
|
+
// Calculate optimal GPU layers if GPU is supported
|
|
105
|
+
int optimalGpuLayers = 0;
|
|
106
|
+
if (gpuSupported) {
|
|
107
|
+
optimalGpuLayers = SystemUtils::getOptimalGpuLayers(model);
|
|
108
|
+
}
|
|
85
109
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
110
|
+
// Extract quantization type from model description
|
|
111
|
+
std::string desc(buf);
|
|
112
|
+
std::string quantType = "Unknown";
|
|
113
|
+
size_t qPos = desc.find(" Q");
|
|
114
|
+
if (qPos != std::string::npos && qPos + 5 <= desc.length()) {
|
|
115
|
+
// Extract quantization string (like Q4_K, Q5_K, Q8_0)
|
|
116
|
+
quantType = desc.substr(qPos + 1, 4);
|
|
117
|
+
// Remove any trailing non-alphanumeric characters
|
|
118
|
+
quantType.erase(std::find_if(quantType.rbegin(), quantType.rend(), [](char c) {
|
|
119
|
+
return std::isalnum(c);
|
|
120
|
+
}).base(), quantType.end());
|
|
121
|
+
}
|
|
89
122
|
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
123
|
+
// Free the model
|
|
124
|
+
llama_model_free(model);
|
|
125
|
+
|
|
126
|
+
// Schedule success callback on JS thread to create JSI objects
|
|
127
|
+
invoker->invokeAsync([selfPtr, resolve, n_params, n_vocab, n_context, n_embd, description, gpuSupported, optimalGpuLayers, quantType, runtimePtr]() {
|
|
128
|
+
try {
|
|
129
|
+
// Create result object on JS thread
|
|
130
|
+
jsi::Object result(*runtimePtr);
|
|
131
|
+
result.setProperty(*runtimePtr, "n_params", jsi::Value(n_params));
|
|
132
|
+
result.setProperty(*runtimePtr, "n_vocab", jsi::Value(n_vocab));
|
|
133
|
+
result.setProperty(*runtimePtr, "n_context", jsi::Value(n_context));
|
|
134
|
+
result.setProperty(*runtimePtr, "n_embd", jsi::Value(n_embd));
|
|
135
|
+
result.setProperty(*runtimePtr, "description", jsi::String::createFromUtf8(*runtimePtr, description));
|
|
136
|
+
result.setProperty(*runtimePtr, "gpuSupported", jsi::Value(gpuSupported));
|
|
137
|
+
result.setProperty(*runtimePtr, "optimalGpuLayers", jsi::Value(optimalGpuLayers));
|
|
138
|
+
result.setProperty(*runtimePtr, "quant_type", jsi::String::createFromUtf8(*runtimePtr, quantType));
|
|
139
|
+
result.setProperty(*runtimePtr, "architecture", jsi::String::createFromUtf8(*runtimePtr, "Unknown"));
|
|
140
|
+
|
|
141
|
+
resolve->call(*runtimePtr, result);
|
|
142
|
+
} catch (const std::exception& e) {
|
|
143
|
+
// If conversion fails, create a simple error response
|
|
144
|
+
jsi::Object errorObj(*runtimePtr);
|
|
145
|
+
errorObj.setProperty(*runtimePtr, "error", jsi::String::createFromUtf8(*runtimePtr, e.what()));
|
|
146
|
+
resolve->call(*runtimePtr, errorObj);
|
|
147
|
+
}
|
|
148
|
+
});
|
|
149
|
+
|
|
150
|
+
} catch (const std::exception& e) {
|
|
151
|
+
// Schedule error callback on JS thread
|
|
152
|
+
std::string errorMsg(e.what());
|
|
153
|
+
invoker->invokeAsync([reject, errorMsg, runtimePtr]() {
|
|
154
|
+
try {
|
|
155
|
+
reject->call(*runtimePtr, jsi::String::createFromUtf8(*runtimePtr, errorMsg));
|
|
156
|
+
} catch (...) {
|
|
157
|
+
// Ignore rejection errors
|
|
158
|
+
}
|
|
159
|
+
});
|
|
160
|
+
}
|
|
161
|
+
}).detach();
|
|
162
|
+
|
|
163
|
+
return jsi::Value::undefined();
|
|
108
164
|
}
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
result.setProperty(runtime, "architecture",
|
|
113
|
-
jsi::String::createFromUtf8(runtime, "Unknown"));
|
|
114
|
-
|
|
115
|
-
// Free the model
|
|
116
|
-
llama_model_free(model);
|
|
117
|
-
|
|
118
|
-
return result;
|
|
119
|
-
} catch (const std::exception& e) {
|
|
120
|
-
jsi::Object error(runtime);
|
|
121
|
-
error.setProperty(runtime, "message", jsi::String::createFromUtf8(runtime, e.what()));
|
|
122
|
-
throw jsi::JSError(runtime, error.getProperty(runtime, "message").asString(runtime));
|
|
123
|
-
}
|
|
165
|
+
);
|
|
166
|
+
|
|
167
|
+
return Promise.callAsConstructor(runtime, std::move(executor));
|
|
124
168
|
}
|
|
125
169
|
|
|
126
170
|
jsi::Value PureCppImpl::initLlama(jsi::Runtime &runtime, jsi::Object options) {
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
if (!options.hasProperty(runtime, "model")) {
|
|
132
|
-
throw std::runtime_error("model path is required");
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
// Initialize llama backend
|
|
136
|
-
llama_backend_init();
|
|
137
|
-
|
|
138
|
-
std::string model_path = options.getProperty(runtime, "model").asString(runtime).utf8(runtime);
|
|
139
|
-
SystemUtils::normalizeFilePath(model_path);
|
|
140
|
-
|
|
141
|
-
// Initialize params with defaults
|
|
142
|
-
rn_common_params params;
|
|
143
|
-
|
|
144
|
-
// Set default sampling parameters
|
|
145
|
-
params.sampling = common_params_sampling();
|
|
146
|
-
|
|
147
|
-
// Set model path
|
|
148
|
-
params.model.path = model_path;
|
|
149
|
-
|
|
150
|
-
// Override defaults with user settings if provided
|
|
151
|
-
SystemUtils::setIfExists(runtime, options, "n_ctx", params.n_ctx);
|
|
152
|
-
SystemUtils::setIfExists(runtime, options, "n_batch", params.n_batch);
|
|
153
|
-
SystemUtils::setIfExists(runtime, options, "n_ubatch", params.n_ubatch);
|
|
154
|
-
SystemUtils::setIfExists(runtime, options, "n_keep", params.n_keep);
|
|
155
|
-
|
|
156
|
-
// Memory and resource options - MUST respect user settings
|
|
157
|
-
SystemUtils::setIfExists(runtime, options, "use_mmap", params.use_mmap);
|
|
158
|
-
SystemUtils::setIfExists(runtime, options, "use_mlock", params.use_mlock);
|
|
159
|
-
SystemUtils::setIfExists(runtime, options, "use_jinja", params.use_jinja);
|
|
160
|
-
|
|
161
|
-
// Extract threading parameters (preserve custom thread logic)
|
|
162
|
-
int n_threads = 0; // 0 = auto
|
|
163
|
-
if (options.hasProperty(runtime, "n_threads")) {
|
|
164
|
-
n_threads = options.getProperty(runtime, "n_threads").asNumber();
|
|
165
|
-
} else {
|
|
166
|
-
n_threads = SystemUtils::getOptimalThreadCount();
|
|
167
|
-
}
|
|
168
|
-
params.cpuparams.n_threads = n_threads;
|
|
169
|
-
|
|
170
|
-
// Set n_gpu_layers (preserve custom GPU logic)
|
|
171
|
-
int n_gpu_layers = 0;
|
|
172
|
-
bool gpuSupported = llama_supports_gpu_offload();
|
|
173
|
-
if (options.hasProperty(runtime, "n_gpu_layers") && gpuSupported) {
|
|
174
|
-
n_gpu_layers = options.getProperty(runtime, "n_gpu_layers").asNumber();
|
|
175
|
-
}
|
|
176
|
-
params.n_gpu_layers = n_gpu_layers;
|
|
177
|
-
|
|
178
|
-
// Additional model parameters
|
|
179
|
-
SystemUtils::setIfExists(runtime, options, "logits_file", params.logits_file);
|
|
180
|
-
SystemUtils::setIfExists(runtime, options, "embedding", params.embedding);
|
|
181
|
-
SystemUtils::setIfExists(runtime, options, "rope_freq_base", params.rope_freq_base);
|
|
182
|
-
SystemUtils::setIfExists(runtime, options, "rope_freq_scale", params.rope_freq_scale);
|
|
183
|
-
|
|
184
|
-
// Sampling parameters
|
|
185
|
-
SystemUtils::setIfExists(runtime, options, "seed", params.sampling.seed);
|
|
171
|
+
// Parse JSI arguments to native types on JSI thread
|
|
172
|
+
if (!options.hasProperty(runtime, "model")) {
|
|
173
|
+
throw jsi::JSError(runtime, "model path is required");
|
|
174
|
+
}
|
|
186
175
|
|
|
187
|
-
|
|
188
|
-
|
|
176
|
+
if (!jsInvoker_) {
|
|
177
|
+
// Fallback to synchronous if no CallInvoker available - this should not happen normally
|
|
178
|
+
throw jsi::JSError(runtime, "CallInvoker not available for async operation");
|
|
179
|
+
}
|
|
189
180
|
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
181
|
+
// Parse all options to native types on JSI thread
|
|
182
|
+
std::string model_path = options.getProperty(runtime, "model").asString(runtime).utf8(runtime);
|
|
183
|
+
SystemUtils::normalizeFilePath(model_path);
|
|
184
|
+
|
|
185
|
+
// Parse all numeric/boolean options to native types
|
|
186
|
+
int n_ctx = 2048; // defaults
|
|
187
|
+
int n_batch = 512;
|
|
188
|
+
int n_ubatch = 512;
|
|
189
|
+
int n_keep = 0;
|
|
190
|
+
bool use_mmap = true;
|
|
191
|
+
bool use_mlock = false;
|
|
192
|
+
bool use_jinja = false;
|
|
193
|
+
bool embedding = false;
|
|
194
|
+
int n_threads = 0;
|
|
195
|
+
int n_gpu_layers = 0;
|
|
196
|
+
std::string logits_file;
|
|
197
|
+
float rope_freq_base = 10000.0f;
|
|
198
|
+
float rope_freq_scale = 1.0f;
|
|
199
|
+
uint32_t seed = 4294967295U; // default seed
|
|
200
|
+
int verbosity = 0;
|
|
201
|
+
float yarn_ext_factor = 1.0f;
|
|
202
|
+
float yarn_attn_factor = 1.0f;
|
|
203
|
+
float yarn_beta_fast = 32.0f;
|
|
204
|
+
float yarn_beta_slow = 1.0f;
|
|
205
|
+
std::string chat_template;
|
|
206
|
+
|
|
207
|
+
// Parse options to native types
|
|
208
|
+
SystemUtils::setIfExists(runtime, options, "n_ctx", n_ctx);
|
|
209
|
+
SystemUtils::setIfExists(runtime, options, "n_batch", n_batch);
|
|
210
|
+
SystemUtils::setIfExists(runtime, options, "n_ubatch", n_ubatch);
|
|
211
|
+
SystemUtils::setIfExists(runtime, options, "n_keep", n_keep);
|
|
212
|
+
SystemUtils::setIfExists(runtime, options, "use_mmap", use_mmap);
|
|
213
|
+
SystemUtils::setIfExists(runtime, options, "use_mlock", use_mlock);
|
|
214
|
+
SystemUtils::setIfExists(runtime, options, "use_jinja", use_jinja);
|
|
215
|
+
SystemUtils::setIfExists(runtime, options, "embedding", embedding);
|
|
216
|
+
SystemUtils::setIfExists(runtime, options, "rope_freq_base", rope_freq_base);
|
|
217
|
+
SystemUtils::setIfExists(runtime, options, "rope_freq_scale", rope_freq_scale);
|
|
218
|
+
SystemUtils::setIfExists(runtime, options, "seed", seed);
|
|
219
|
+
SystemUtils::setIfExists(runtime, options, "verbose", verbosity);
|
|
220
|
+
SystemUtils::setIfExists(runtime, options, "logits_file", logits_file);
|
|
221
|
+
SystemUtils::setIfExists(runtime, options, "chat_template", chat_template);
|
|
222
|
+
|
|
223
|
+
if (options.hasProperty(runtime, "n_threads")) {
|
|
224
|
+
n_threads = options.getProperty(runtime, "n_threads").asNumber();
|
|
225
|
+
} else {
|
|
226
|
+
n_threads = SystemUtils::getOptimalThreadCount();
|
|
227
|
+
}
|
|
203
228
|
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
}
|
|
229
|
+
bool gpuSupported = llama_supports_gpu_offload();
|
|
230
|
+
if (options.hasProperty(runtime, "n_gpu_layers") && gpuSupported) {
|
|
231
|
+
n_gpu_layers = options.getProperty(runtime, "n_gpu_layers").asNumber();
|
|
232
|
+
}
|
|
209
233
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
lora.path = adapter.getProperty(runtime, "path").asString(runtime).utf8(runtime);
|
|
223
|
-
|
|
224
|
-
// Get scale if provided
|
|
225
|
-
lora.scale = 1.0f; // Default scale
|
|
226
|
-
if (adapter.hasProperty(runtime, "scale") && adapter.getProperty(runtime, "scale").isNumber()) {
|
|
227
|
-
lora.scale = adapter.getProperty(runtime, "scale").asNumber();
|
|
228
|
-
}
|
|
234
|
+
if (options.hasProperty(runtime, "yarn_ext_factor")) {
|
|
235
|
+
yarn_ext_factor = options.getProperty(runtime, "yarn_ext_factor").asNumber();
|
|
236
|
+
}
|
|
237
|
+
if (options.hasProperty(runtime, "yarn_attn_factor")) {
|
|
238
|
+
yarn_attn_factor = options.getProperty(runtime, "yarn_attn_factor").asNumber();
|
|
239
|
+
}
|
|
240
|
+
if (options.hasProperty(runtime, "yarn_beta_fast")) {
|
|
241
|
+
yarn_beta_fast = options.getProperty(runtime, "yarn_beta_fast").asNumber();
|
|
242
|
+
}
|
|
243
|
+
if (options.hasProperty(runtime, "yarn_beta_slow")) {
|
|
244
|
+
yarn_beta_slow = options.getProperty(runtime, "yarn_beta_slow").asNumber();
|
|
245
|
+
}
|
|
229
246
|
|
|
230
|
-
|
|
247
|
+
// Parse LoRA adapters to native structure
|
|
248
|
+
std::vector<std::pair<std::string, float>> lora_adapters;
|
|
249
|
+
if (options.hasProperty(runtime, "lora_adapters") && options.getProperty(runtime, "lora_adapters").isObject()) {
|
|
250
|
+
jsi::Object lora_obj = options.getProperty(runtime, "lora_adapters").asObject(runtime);
|
|
251
|
+
if (lora_obj.isArray(runtime)) {
|
|
252
|
+
jsi::Array lora_array = lora_obj.asArray(runtime);
|
|
253
|
+
size_t n_lora = lora_array.size(runtime);
|
|
254
|
+
|
|
255
|
+
for (size_t i = 0; i < n_lora; i++) {
|
|
256
|
+
if (lora_array.getValueAtIndex(runtime, i).isObject()) {
|
|
257
|
+
jsi::Object adapter = lora_array.getValueAtIndex(runtime, i).asObject(runtime);
|
|
258
|
+
if (adapter.hasProperty(runtime, "path") && adapter.getProperty(runtime, "path").isString()) {
|
|
259
|
+
std::string lora_path = adapter.getProperty(runtime, "path").asString(runtime).utf8(runtime);
|
|
260
|
+
float lora_scale = 1.0f; // Default scale
|
|
261
|
+
if (adapter.hasProperty(runtime, "scale") && adapter.getProperty(runtime, "scale").isNumber()) {
|
|
262
|
+
lora_scale = adapter.getProperty(runtime, "scale").asNumber();
|
|
231
263
|
}
|
|
264
|
+
lora_adapters.emplace_back(lora_path, lora_scale);
|
|
232
265
|
}
|
|
233
266
|
}
|
|
234
267
|
}
|
|
235
268
|
}
|
|
269
|
+
}
|
|
236
270
|
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
271
|
+
// Create Promise constructor
|
|
272
|
+
auto Promise = runtime.global().getPropertyAsFunction(runtime, "Promise");
|
|
273
|
+
|
|
274
|
+
auto executor = jsi::Function::createFromHostFunction(
|
|
275
|
+
runtime,
|
|
276
|
+
jsi::PropNameID::forAscii(runtime, "executor"),
|
|
277
|
+
2,
|
|
278
|
+
[this, model_path, n_ctx, n_batch, n_ubatch, n_keep, use_mmap, use_mlock, use_jinja, embedding, n_threads, n_gpu_layers, logits_file, rope_freq_base, rope_freq_scale, seed, verbosity, yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow, chat_template, lora_adapters](jsi::Runtime& runtime, const jsi::Value& thisValue, const jsi::Value* args, size_t count) -> jsi::Value {
|
|
242
279
|
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
params.n_gpu_layers = 0;
|
|
254
|
-
|
|
280
|
+
auto resolve = std::make_shared<jsi::Function>(args[0].asObject(runtime).asFunction(runtime));
|
|
281
|
+
auto reject = std::make_shared<jsi::Function>(args[1].asObject(runtime).asFunction(runtime));
|
|
282
|
+
|
|
283
|
+
// Create shared references to runtime and invoker for thread safety
|
|
284
|
+
auto runtimePtr = &runtime;
|
|
285
|
+
auto invoker = jsInvoker_;
|
|
286
|
+
auto selfPtr = shared_from_this();
|
|
287
|
+
|
|
288
|
+
// Launch background thread for model initialization
|
|
289
|
+
std::thread([selfPtr, model_path, n_ctx, n_batch, n_ubatch, n_keep, use_mmap, use_mlock, use_jinja, embedding, n_threads, n_gpu_layers, logits_file, rope_freq_base, rope_freq_scale, seed, verbosity, yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow, chat_template, lora_adapters, resolve, reject, runtimePtr, invoker]() {
|
|
255
290
|
try {
|
|
256
|
-
|
|
291
|
+
// Thread-safe access to member variables
|
|
292
|
+
std::lock_guard<std::mutex> lock(selfPtr->mutex_);
|
|
293
|
+
|
|
294
|
+
// Initialize llama backend
|
|
295
|
+
llama_backend_init();
|
|
296
|
+
|
|
297
|
+
// Initialize params with defaults
|
|
298
|
+
rn_common_params params;
|
|
299
|
+
|
|
300
|
+
// Set default sampling parameters
|
|
301
|
+
params.sampling = common_params_sampling();
|
|
302
|
+
|
|
303
|
+
// Set all parsed native values
|
|
304
|
+
params.model.path = model_path;
|
|
305
|
+
params.n_ctx = n_ctx;
|
|
306
|
+
params.n_batch = n_batch;
|
|
307
|
+
params.n_ubatch = n_ubatch;
|
|
308
|
+
params.n_keep = n_keep;
|
|
309
|
+
params.use_mmap = use_mmap;
|
|
310
|
+
params.use_mlock = use_mlock;
|
|
311
|
+
params.use_jinja = use_jinja;
|
|
312
|
+
params.embedding = embedding;
|
|
313
|
+
params.cpuparams.n_threads = n_threads;
|
|
314
|
+
params.n_gpu_layers = n_gpu_layers;
|
|
315
|
+
params.logits_file = logits_file;
|
|
316
|
+
params.rope_freq_base = rope_freq_base;
|
|
317
|
+
params.rope_freq_scale = rope_freq_scale;
|
|
318
|
+
params.sampling.seed = seed;
|
|
319
|
+
params.verbosity = verbosity;
|
|
320
|
+
params.yarn_ext_factor = yarn_ext_factor;
|
|
321
|
+
params.yarn_attn_factor = yarn_attn_factor;
|
|
322
|
+
params.yarn_beta_fast = yarn_beta_fast;
|
|
323
|
+
params.yarn_beta_slow = yarn_beta_slow;
|
|
324
|
+
|
|
325
|
+
if (!chat_template.empty()) {
|
|
326
|
+
params.chat_template = chat_template;
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
// Add LoRA adapters
|
|
330
|
+
for (const auto& lora : lora_adapters) {
|
|
331
|
+
common_adapter_lora_info lora_info;
|
|
332
|
+
lora_info.path = lora.first;
|
|
333
|
+
lora_info.scale = lora.second;
|
|
334
|
+
params.lora_adapters.push_back(lora_info);
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
// Initialize using common_init_from_params
|
|
338
|
+
common_init_result result;
|
|
257
339
|
|
|
258
|
-
|
|
259
|
-
|
|
340
|
+
try {
|
|
341
|
+
result = common_init_from_params(params);
|
|
342
|
+
|
|
343
|
+
// Check if initialization was successful
|
|
344
|
+
if (!result.model || !result.context) {
|
|
345
|
+
throw std::runtime_error("Failed to initialize model and context");
|
|
346
|
+
}
|
|
347
|
+
} catch (const std::exception& e) {
|
|
348
|
+
// If we were trying to use GPU and got an error, retry with CPU-only
|
|
349
|
+
if (params.n_gpu_layers > 0) {
|
|
350
|
+
fprintf(stderr, "GPU initialization failed (%s), retrying with CPU-only\n", e.what());
|
|
351
|
+
|
|
352
|
+
params.n_gpu_layers = 0;
|
|
353
|
+
|
|
354
|
+
try {
|
|
355
|
+
result = common_init_from_params(params);
|
|
356
|
+
|
|
357
|
+
if (!result.model || !result.context) {
|
|
358
|
+
throw std::runtime_error("Failed to initialize model and context even with CPU-only mode");
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
fprintf(stderr, "Successfully recovered with CPU-only mode after GPU failure\n");
|
|
362
|
+
} catch (const std::exception& cpu_e) {
|
|
363
|
+
throw std::runtime_error(std::string("Model initialization failed: ") + cpu_e.what());
|
|
364
|
+
}
|
|
365
|
+
} else {
|
|
366
|
+
// Was already CPU-only, re-throw the original error
|
|
367
|
+
throw std::runtime_error(std::string("Model initialization failed: ") + e.what());
|
|
368
|
+
}
|
|
369
|
+
}
|
|
370
|
+
|
|
371
|
+
// Create and initialize rn_llama_context
|
|
372
|
+
selfPtr->rn_ctx_ = std::make_unique<facebook::react::rn_llama_context>();
|
|
373
|
+
selfPtr->rn_ctx_->model = result.model.release();
|
|
374
|
+
selfPtr->rn_ctx_->ctx = result.context.release();
|
|
375
|
+
selfPtr->rn_ctx_->model_loaded = true;
|
|
376
|
+
selfPtr->rn_ctx_->vocab = llama_model_get_vocab(selfPtr->rn_ctx_->model);
|
|
377
|
+
|
|
378
|
+
// Create a rn_common_params from the common_params
|
|
379
|
+
rn_common_params rn_params;
|
|
380
|
+
// Copy the base class fields
|
|
381
|
+
static_cast<common_params&>(rn_params) = params;
|
|
382
|
+
// Set additional fields
|
|
383
|
+
rn_params.use_jinja = params.use_jinja;
|
|
384
|
+
rn_params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
|
385
|
+
// Now assign to the context
|
|
386
|
+
selfPtr->rn_ctx_->params = rn_params;
|
|
387
|
+
|
|
388
|
+
selfPtr->rn_ctx_->chat_templates = common_chat_templates_init(selfPtr->rn_ctx_->model, params.chat_template);
|
|
389
|
+
try {
|
|
390
|
+
common_chat_format_example(selfPtr->rn_ctx_->chat_templates.get(), params.use_jinja);
|
|
391
|
+
} catch (const std::exception & e) {
|
|
392
|
+
// Fallback to chatml if the original template parsing fails
|
|
393
|
+
selfPtr->rn_ctx_->chat_templates = common_chat_templates_init(selfPtr->rn_ctx_->model, "chatml");
|
|
260
394
|
}
|
|
395
|
+
|
|
396
|
+
// Schedule success callback on JS thread to create JSI objects
|
|
397
|
+
invoker->invokeAsync([selfPtr, resolve, runtimePtr]() {
|
|
398
|
+
try {
|
|
399
|
+
// Create the model object and resolve Promise on JS thread
|
|
400
|
+
jsi::Object modelObject = selfPtr->createModelObject(*runtimePtr, selfPtr->rn_ctx_.get());
|
|
401
|
+
resolve->call(*runtimePtr, modelObject);
|
|
402
|
+
} catch (const std::exception& e) {
|
|
403
|
+
// If conversion fails, create a simple error response
|
|
404
|
+
jsi::Object errorObj(*runtimePtr);
|
|
405
|
+
errorObj.setProperty(*runtimePtr, "error", jsi::String::createFromUtf8(*runtimePtr, e.what()));
|
|
406
|
+
resolve->call(*runtimePtr, errorObj);
|
|
407
|
+
}
|
|
408
|
+
});
|
|
261
409
|
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
410
|
+
} catch (const std::exception& e) {
|
|
411
|
+
// Schedule error callback on JS thread
|
|
412
|
+
std::string errorMsg(e.what());
|
|
413
|
+
fprintf(stderr, "initLlama error: %s\n", errorMsg.c_str());
|
|
414
|
+
invoker->invokeAsync([reject, errorMsg, runtimePtr]() {
|
|
415
|
+
try {
|
|
416
|
+
reject->call(*runtimePtr, jsi::String::createFromUtf8(*runtimePtr, errorMsg));
|
|
417
|
+
} catch (...) {
|
|
418
|
+
// Ignore rejection errors
|
|
419
|
+
}
|
|
420
|
+
});
|
|
265
421
|
}
|
|
266
|
-
}
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
}
|
|
270
|
-
}
|
|
271
|
-
|
|
272
|
-
// Create and initialize rn_llama_context
|
|
273
|
-
rn_ctx_ = std::make_unique<facebook::react::rn_llama_context>();
|
|
274
|
-
rn_ctx_->model = result.model.release();
|
|
275
|
-
rn_ctx_->ctx = result.context.release();
|
|
276
|
-
rn_ctx_->model_loaded = true;
|
|
277
|
-
rn_ctx_->vocab = llama_model_get_vocab(rn_ctx_->model);
|
|
278
|
-
|
|
279
|
-
// Create a rn_common_params from the common_params
|
|
280
|
-
rn_common_params rn_params;
|
|
281
|
-
// Copy the base class fields
|
|
282
|
-
static_cast<common_params&>(rn_params) = params;
|
|
283
|
-
// Set additional fields
|
|
284
|
-
rn_params.use_jinja = params.use_jinja;
|
|
285
|
-
rn_params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
|
286
|
-
// Don't force a specific chat format - let the template system auto-detect based on model and tools
|
|
287
|
-
// rn_params.chat_format = COMMON_CHAT_FORMAT_GENERIC;
|
|
288
|
-
// Now assign to the context
|
|
289
|
-
rn_ctx_->params = rn_params;
|
|
290
|
-
|
|
291
|
-
rn_ctx_->chat_templates = common_chat_templates_init(rn_ctx_->model, params.chat_template);
|
|
292
|
-
try {
|
|
293
|
-
common_chat_format_example(rn_ctx_->chat_templates.get(), params.use_jinja);
|
|
294
|
-
} catch (const std::exception & e) {
|
|
295
|
-
// Fallback to chatml if the original template parsing fails
|
|
296
|
-
rn_ctx_->chat_templates = common_chat_templates_init(rn_ctx_->model, "chatml");
|
|
422
|
+
}).detach();
|
|
423
|
+
|
|
424
|
+
return jsi::Value::undefined();
|
|
297
425
|
}
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
return createModelObject(runtime, rn_ctx_.get());
|
|
302
|
-
} catch (const std::exception& e) {
|
|
303
|
-
// We can keep this top-level error log as it's for initialization failure
|
|
304
|
-
fprintf(stderr, "initLlama error: %s\n", e.what());
|
|
305
|
-
throw jsi::JSError(runtime, e.what());
|
|
306
|
-
}
|
|
426
|
+
);
|
|
427
|
+
|
|
428
|
+
return Promise.callAsConstructor(runtime, std::move(executor));
|
|
307
429
|
}
|
|
308
430
|
|
|
309
431
|
jsi::Object PureCppImpl::createModelObject(jsi::Runtime& runtime, rn_llama_context* rn_ctx) {
|
package/cpp/PureCppImpl.h
CHANGED
|
@@ -28,7 +28,7 @@ class LlamaCppModel; // Forward declare LlamaCppModel
|
|
|
28
28
|
namespace facebook::react {
|
|
29
29
|
|
|
30
30
|
// Note: The class name is PureCppImpl, and it derives from your project's C++ spec
|
|
31
|
-
class PureCppImpl : public NativeRNLlamaCppCxxSpec<PureCppImpl> {
|
|
31
|
+
class PureCppImpl : public NativeRNLlamaCppCxxSpec<PureCppImpl>, public std::enable_shared_from_this<PureCppImpl> {
|
|
32
32
|
public:
|
|
33
33
|
// Constructor
|
|
34
34
|
PureCppImpl(std::shared_ptr<CallInvoker> jsInvoker);
|
package/cpp/rn-completion.cpp
CHANGED
|
@@ -147,30 +147,13 @@ CompletionResult run_completion(
|
|
|
147
147
|
json data = options.to_json();
|
|
148
148
|
// Prepare the sampling parameters
|
|
149
149
|
const auto& params = rn_ctx->params;
|
|
150
|
-
|
|
151
|
-
//
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
if (tokenized_prompts.empty() || tokenized_prompts[0].empty()) {
|
|
156
|
-
result.success = false;
|
|
157
|
-
result.error_msg = "Empty prompt";
|
|
158
|
-
result.error_type = RN_ERROR_INVALID_PARAM;
|
|
159
|
-
return result;
|
|
160
|
-
}
|
|
161
|
-
state.prompt_tokens = std::move(tokenized_prompts[0]);
|
|
162
|
-
} else {
|
|
163
|
-
result.success = false;
|
|
164
|
-
result.error_msg = "No prompt provided";
|
|
165
|
-
result.error_type = RN_ERROR_INVALID_PARAM;
|
|
166
|
-
return result;
|
|
150
|
+
|
|
151
|
+
// Create a copy of sampling parameters and apply grammar if provided
|
|
152
|
+
common_params_sampling sampling_params = params.sampling;
|
|
153
|
+
if (!options.grammar.empty()) {
|
|
154
|
+
sampling_params.grammar = options.grammar;
|
|
167
155
|
}
|
|
168
156
|
|
|
169
|
-
// Configure state
|
|
170
|
-
state.n_ctx = llama_n_ctx(rn_ctx->ctx);
|
|
171
|
-
state.n_predict = options.n_predict > 0 ? options.n_predict : params.n_predict;
|
|
172
|
-
state.n_remaining = state.n_predict;
|
|
173
|
-
|
|
174
157
|
// Parse tool_choice
|
|
175
158
|
if (options.tool_choice == "auto") {
|
|
176
159
|
state.tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
|
|
@@ -179,8 +162,8 @@ CompletionResult run_completion(
|
|
|
179
162
|
} else if (options.tool_choice == "required") {
|
|
180
163
|
state.tool_choice = COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
|
181
164
|
}
|
|
182
|
-
// Initialize the sampler
|
|
183
|
-
state.sampler = common_sampler_init(rn_ctx->model,
|
|
165
|
+
// Initialize the sampler with the updated sampling parameters
|
|
166
|
+
state.sampler = common_sampler_init(rn_ctx->model, sampling_params);
|
|
184
167
|
if (!state.sampler) {
|
|
185
168
|
result.success = false;
|
|
186
169
|
result.error_msg = "Failed to initialize sampler";
|
|
@@ -201,6 +184,29 @@ CompletionResult run_completion(
|
|
|
201
184
|
}
|
|
202
185
|
}
|
|
203
186
|
|
|
187
|
+
// Set the prompt
|
|
188
|
+
if (data.contains("prompt")) {
|
|
189
|
+
// Tokenize the prompt
|
|
190
|
+
const auto& tokenized_prompts = tokenize_input_prompts(rn_ctx->vocab, data["prompt"], true, true);
|
|
191
|
+
if (tokenized_prompts.empty() || tokenized_prompts[0].empty()) {
|
|
192
|
+
result.success = false;
|
|
193
|
+
result.error_msg = "Empty prompt";
|
|
194
|
+
result.error_type = RN_ERROR_INVALID_PARAM;
|
|
195
|
+
return result;
|
|
196
|
+
}
|
|
197
|
+
state.prompt_tokens = std::move(tokenized_prompts[0]);
|
|
198
|
+
} else {
|
|
199
|
+
result.success = false;
|
|
200
|
+
result.error_msg = "No prompt provided";
|
|
201
|
+
result.error_type = RN_ERROR_INVALID_PARAM;
|
|
202
|
+
return result;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
// Configure state
|
|
206
|
+
state.n_ctx = llama_n_ctx(rn_ctx->ctx);
|
|
207
|
+
state.n_predict = options.n_predict > 0 ? options.n_predict : params.n_predict;
|
|
208
|
+
state.n_remaining = state.n_predict;
|
|
209
|
+
|
|
204
210
|
// Process the prompt
|
|
205
211
|
for (int i = 0; i < (int)state.prompt_tokens.size(); ++i) {
|
|
206
212
|
llama_token token = state.prompt_tokens[i];
|
|
@@ -222,7 +228,11 @@ CompletionResult run_completion(
|
|
|
222
228
|
return result;
|
|
223
229
|
}
|
|
224
230
|
|
|
225
|
-
|
|
231
|
+
// Only accept tokens during prompt processing if no grammar is present
|
|
232
|
+
// Grammar-based sampling needs to start fresh from the generation phase
|
|
233
|
+
if (sampling_params.grammar.empty()) {
|
|
234
|
+
common_sampler_accept(state.sampler, token, true);
|
|
235
|
+
}
|
|
226
236
|
state.n_past++;
|
|
227
237
|
}
|
|
228
238
|
|
|
@@ -435,31 +445,15 @@ CompletionResult run_chat_completion(
|
|
|
435
445
|
|
|
436
446
|
// Add parsed content and tool calls if available
|
|
437
447
|
if (has_parsed_content && !parsed_msg.tool_calls.empty()) {
|
|
438
|
-
//
|
|
439
|
-
|
|
440
|
-
choice["message"]["content"] = parsed_msg.content;
|
|
441
|
-
} else {
|
|
442
|
-
choice["message"]["content"] = nullptr;
|
|
443
|
-
}
|
|
444
|
-
|
|
445
|
-
// Add tool calls to the message
|
|
446
|
-
json tool_calls = json::array();
|
|
447
|
-
for (const auto& tool_call : parsed_msg.tool_calls) {
|
|
448
|
-
json tc = {
|
|
449
|
-
{"id", tool_call.id.empty() ? ("call_" + std::to_string(std::rand())) : tool_call.id},
|
|
450
|
-
{"type", "function"},
|
|
451
|
-
{"function", {
|
|
452
|
-
{"name", tool_call.name},
|
|
453
|
-
{"arguments", tool_call.arguments}
|
|
454
|
-
}}
|
|
455
|
-
};
|
|
456
|
-
tool_calls.push_back(tc);
|
|
457
|
-
}
|
|
458
|
-
choice["message"]["tool_calls"] = tool_calls;
|
|
448
|
+
// Use the server.cpp approach: let the common_chat_msg handle the JSON conversion
|
|
449
|
+
choice["message"] = parsed_msg.to_json_oaicompat<json>();
|
|
459
450
|
choice["finish_reason"] = "tool_calls";
|
|
451
|
+
} else if (has_parsed_content && !parsed_msg.content.empty()) {
|
|
452
|
+
// Regular text response with parsed content
|
|
453
|
+
choice["message"]["content"] = parsed_msg.content;
|
|
460
454
|
} else {
|
|
461
|
-
//
|
|
462
|
-
choice["message"]["content"] =
|
|
455
|
+
// Fallback to raw content if parsing failed or no tools
|
|
456
|
+
choice["message"]["content"] = result.content;
|
|
463
457
|
}
|
|
464
458
|
|
|
465
459
|
choices.push_back(choice);
|
package/package.json
CHANGED
package/src/NativeRNLlamaCpp.ts
CHANGED
|
@@ -234,7 +234,7 @@ export interface Spec extends TurboModule {
|
|
|
234
234
|
// Initialize a Llama context with the given model parameters
|
|
235
235
|
initLlama(params: LlamaModelParams): Promise<LlamaContextType & LlamaContextMethods>;
|
|
236
236
|
|
|
237
|
-
// Load model info without creating a full
|
|
237
|
+
// Load model info without creating a full context
|
|
238
238
|
loadLlamaModelInfo(modelPath: string): Promise<{
|
|
239
239
|
n_params: number;
|
|
240
240
|
n_vocab: number;
|