@fugood/llama.node 1.3.2 → 1.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +8 -3
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +5 -5
- package/src/LlamaCompletionWorker.cpp +33 -33
- package/src/LlamaContext.cpp +17 -16
- package/src/llama.cpp/CMakeLists.txt +4 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -37
- package/src/llama.cpp/common/common.cpp +1 -5
- package/src/llama.cpp/common/download.cpp +47 -29
- package/src/llama.cpp/common/log.cpp +6 -0
- package/src/llama.cpp/common/log.h +2 -0
- package/src/llama.cpp/ggml/include/ggml.h +71 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +29 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +283 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +235 -34
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +289 -277
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +95 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +10 -0
- package/src/llama.cpp/src/CMakeLists.txt +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +32 -0
- package/src/llama.cpp/src/llama-arch.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +2 -1
- package/src/llama.cpp/src/llama-model.cpp +102 -0
- package/src/llama.cpp/src/llama-model.h +2 -0
- package/src/llama.cpp/src/llama-sampling.cpp +10 -5
- package/src/llama.cpp/src/llama-vocab.cpp +16 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/models/afmoe.cpp +187 -0
- package/src/llama.cpp/src/models/models.h +4 -0
- package/src/llama.cpp/src/unicode.cpp +77 -0
package/CMakeLists.txt
CHANGED
@@ -120,9 +120,14 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND NOT DEFINED GGML_OPENMP OR GGML_O
 endif()
 
 set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build common")
-
+set(LLAMA_BUILD_TOOLS OFF CACHE BOOL "Build tools")
+set(LLAMA_BUILD_TESTS OFF CACHE BOOL "Build tests")
+set(LLAMA_BUILD_SERVER OFF CACHE BOOL "Build server")
+set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "Build examples")
 set(LLAMA_CURL OFF CACHE BOOL "Build curl")
 
+set(LLAMA_INSTALL_VERSION "0.0.0") # TODO: Set the version number (0.0.<BUILD_NUMBER>)
+
 set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries")
 
 add_definitions(-DGGML_MAX_NAME=80)
@@ -172,7 +177,7 @@ if (NOT MSVC AND CMAKE_SYSTEM_NAME STREQUAL "Windows")
 
 add_library(win_dynamic_load ${WIN_DYNAMIC_LOAD_SRC})
 set_target_properties(win_dynamic_load PROPERTIES COMPILE_FLAGS "-Wno-implicit-function-declaration")
-
+
 unset(CMAKE_JS_SRC)
 unset(CMAKE_JS_LIB)
 unset(CMAKE_JS_NODELIB_DEF)
@@ -207,7 +212,7 @@ if(CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
 endif()
 
 if (GGML_METAL AND NOT GGML_METAL_EMBED_LIBRARY)
-# copy ${CMAKE_BINARY_DIR}/bin/default.metallib
+# copy ${CMAKE_BINARY_DIR}/bin/default.metallib
 add_custom_command(
 TARGET copy_assets
 COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/bin/default.metallib ${METAL_LIB_TARGET_PATH}

package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.3.2",
+  "version": "1.3.4",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.3.
-    "@fugood/node-llama-linux-x64-vulkan": "1.3.
-    "@fugood/node-llama-linux-x64-cuda": "1.3.
-    "@fugood/node-llama-linux-arm64": "1.3.
-    "@fugood/node-llama-linux-arm64-vulkan": "1.3.
-    "@fugood/node-llama-linux-arm64-cuda": "1.3.
-    "@fugood/node-llama-win32-x64": "1.3.
-    "@fugood/node-llama-win32-x64-vulkan": "1.3.
-    "@fugood/node-llama-win32-x64-cuda": "1.3.
-    "@fugood/node-llama-win32-arm64": "1.3.
-    "@fugood/node-llama-win32-arm64-vulkan": "1.3.
-    "@fugood/node-llama-darwin-x64": "1.3.
-    "@fugood/node-llama-darwin-arm64": "1.3.
+    "@fugood/node-llama-linux-x64": "1.3.4",
+    "@fugood/node-llama-linux-x64-vulkan": "1.3.4",
+    "@fugood/node-llama-linux-x64-cuda": "1.3.4",
+    "@fugood/node-llama-linux-arm64": "1.3.4",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.3.4",
+    "@fugood/node-llama-linux-arm64-cuda": "1.3.4",
+    "@fugood/node-llama-win32-x64": "1.3.4",
+    "@fugood/node-llama-win32-x64-vulkan": "1.3.4",
+    "@fugood/node-llama-win32-x64-cuda": "1.3.4",
+    "@fugood/node-llama-win32-arm64": "1.3.4",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.3.4",
+    "@fugood/node-llama-darwin-x64": "1.3.4",
+    "@fugood/node-llama-darwin-arm64": "1.3.4"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",

package/scripts/llama.cpp.patch
CHANGED
@@ -1,8 +1,8 @@
 diff --git a/src/llama.cpp/common/CMakeLists.txt b/src/llama.cpp/common/CMakeLists.txt
-index
+index 706fa32ee..248459903 100644
 --- a/src/llama.cpp/common/CMakeLists.txt
 +++ b/src/llama.cpp/common/CMakeLists.txt
-@@ -
+@@ -141,9 +141,16 @@ if (LLAMA_LLGUIDANCE)
 set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
 endif ()
 
@@ -85,10 +85,10 @@ index 50efb0d4e..f471a84c7 100644
 struct common_chat_tool_call {
 std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index
+index 4dc95dcba..ea0ea86c0 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -
+@@ -1155,6 +1155,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
 mparams.n_gpu_layers = params.n_gpu_layers;
 }
 
@@ -109,7 +109,7 @@ index f42c083fa..c573cc812 100644
 int32_t n_ctx = 4096; // context size
 int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index
+index e52e050a8..c1000c162 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

package/src/LlamaCompletionWorker.cpp
CHANGED
@@ -9,10 +9,10 @@ Napi::Array TokenProbsToArray(Napi::Env env, llama_context* ctx, const std::vect
 for (size_t i = 0; i < probs.size(); i++) {
 const auto &prob = probs[i];
 Napi::Object token_obj = Napi::Object::New(env);
-
+
 std::string token_str = common_token_to_piece(ctx, prob.tok);
 token_obj.Set("content", Napi::String::New(env, token_str));
-
+
 Napi::Array token_probs = Napi::Array::New(env);
 for (size_t j = 0; j < prob.probs.size(); j++) {
 const auto &p = prob.probs[j];
@@ -83,10 +83,10 @@ void LlamaCompletionWorker::Execute() {
 }
 
 auto completion = _rn_ctx->completion;
-
+
 // Prepare completion context
 completion->rewind();
-
+
 // Set up parameters
 _rn_ctx->params.prompt = _params.prompt;
 _rn_ctx->params.sampling = _params.sampling;
@@ -95,50 +95,50 @@ void LlamaCompletionWorker::Execute() {
 _rn_ctx->params.n_ctx = _params.n_ctx;
 _rn_ctx->params.n_batch = _params.n_batch;
 _rn_ctx->params.ctx_shift = _params.ctx_shift;
-
+
 // Set prefill text
 completion->prefill_text = _prefill_text;
-
+
 // Set up TTS guide tokens if enabled
 if (_has_vocoder && _rn_ctx->tts_wrapper != nullptr) {
 _rn_ctx->tts_wrapper->guide_tokens = _guide_tokens;
 _rn_ctx->tts_wrapper->next_token_uses_guide_token = true;
 }
-
+
 // Initialize sampling
 if (!completion->initSampling()) {
 SetError("Failed to initialize sampling");
 return;
 }
-
+
 // Load prompt (handles both text-only and multimodal)
 completion->loadPrompt(_media_paths);
-
+
 // Check if context is full after loading prompt
 if (completion->context_full) {
 _result.context_full = true;
 return;
 }
-
+
 // Begin completion with chat format and reasoning settings
 completion->beginCompletion(_chat_format, common_reasoning_format_from_name(_reasoning_format), _thinking_forced_open);
-
+
 // Main completion loop
 int token_count = 0;
 const int max_tokens = _params.n_predict < 0 ? std::numeric_limits<int>::max() : _params.n_predict;
 while (completion->has_next_token && !_interrupted && token_count < max_tokens) {
 // Get next token using rn-llama completion
 rnllama::completion_token_output token_output = completion->doCompletion();
-
+
 if (token_output.tok == -1) {
 break;
 }
-
+
 token_count++;
-
+
 std::string token_text = common_token_to_piece(_rn_ctx->ctx, token_output.tok);
 _result.text += token_text;
-
+
 // Check for stopping strings after adding the token
 if (!_stop_words.empty()) {
 size_t stop_pos = completion->findStoppingStrings(_result.text, token_text.size(), rnllama::STOP_FULL);
@@ -148,7 +148,7 @@ void LlamaCompletionWorker::Execute() {
 break;
 }
 }
-
+
 // Handle streaming callback
 if (_has_callback && !completion->incomplete) {
 struct TokenData {
@@ -160,9 +160,9 @@ void LlamaCompletionWorker::Execute() {
 std::vector<rnllama::completion_token_output> completion_probabilities;
 llama_context* ctx;
 };
-
+
 auto partial_output = completion->parseChatOutput(true);
-
+
 // Extract completion probabilities if n_probs > 0, similar to iOS implementation
 std::vector<rnllama::completion_token_output> probs_output;
 if (_rn_ctx->params.sampling.n_probs > 0) {
@@ -171,23 +171,23 @@ void LlamaCompletionWorker::Execute() {
 size_t probs_stop_pos = std::min(_sent_token_probs_index + to_send_toks.size(), completion->generated_token_probs.size());
 if (probs_pos < probs_stop_pos) {
 probs_output = std::vector<rnllama::completion_token_output>(
-completion->generated_token_probs.begin() + probs_pos,
+completion->generated_token_probs.begin() + probs_pos,
 completion->generated_token_probs.begin() + probs_stop_pos
 );
 }
 _sent_token_probs_index = probs_stop_pos;
 }
-
+
 TokenData *token_data = new TokenData{
-token_text,
-partial_output.content,
-partial_output.reasoning_content,
-partial_output.tool_calls,
+token_text,
+partial_output.content,
+partial_output.reasoning_content,
+partial_output.tool_calls,
 partial_output.accumulated_text,
 probs_output,
 _rn_ctx->ctx
 };
-
+
 _tsfn.BlockingCall(token_data, [](Napi::Env env, Napi::Function jsCallback,
 TokenData *data) {
 auto obj = Napi::Object::New(env);
@@ -216,25 +216,25 @@ void LlamaCompletionWorker::Execute() {
 obj.Set("tool_calls", tool_calls);
 }
 obj.Set("accumulated_text", Napi::String::New(env, data->accumulated_text));
-
+
 // Add completion_probabilities if available
 if (!data->completion_probabilities.empty()) {
 obj.Set("completion_probabilities", TokenProbsToArray(env, data->ctx, data->completion_probabilities));
 }
-
+
 delete data;
 jsCallback.Call({obj});
 });
 }
 }
-
+
 // Check stopping conditions
 if (token_count >= max_tokens) {
 _result.stopped_limited = true;
 } else if (!completion->has_next_token && completion->n_remain == 0) {
 _result.stopped_limited = true;
 }
-
+
 // Set completion results from rn-llama completion context
 // tokens_evaluated should include both prompt tokens and generated tokens that were processed
 _result.tokens_evaluated = completion->num_prompt_tokens + completion->num_tokens_predicted;
@@ -245,20 +245,20 @@ void LlamaCompletionWorker::Execute() {
 _result.stopped_words = completion->stopped_word;
 _result.stopping_word = completion->stopping_word;
 _result.stopped_limited = completion->stopped_limit;
-
+
 // Get audio tokens if TTS is enabled
 if (_has_vocoder && _rn_ctx->tts_wrapper != nullptr) {
 _result.audio_tokens = _rn_ctx->tts_wrapper->audio_tokens;
 }
-
+common_perf_print(_rn_ctx->ctx, _rn_ctx->completion->ctx_sampling);
 // End completion
 completion->endCompletion();
-
+
 } catch (const std::exception &e) {
 SetError(e.what());
 return;
 }
-
+
 if (_onComplete) {
 _onComplete();
 }

package/src/LlamaContext.cpp
CHANGED
@@ -376,6 +376,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
 _rn_ctx = nullptr;
 Napi::TypeError::New(env, "Failed to load model").ThrowAsJavaScriptException();
 }
+_rn_ctx->attachThreadpoolsIfAvailable();
 
 // Release progress callback after model is loaded
 if (has_progress_callback) {
@@ -386,7 +387,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
 if (!lora.empty()) {
 _rn_ctx->applyLoraAdapters(lora);
 }
-
+
 _info = common_params_get_system_info(params);
 }
 
@@ -636,7 +637,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
 auto enable_thinking = get_option<bool>(params, "enable_thinking", false);
 auto add_generation_prompt = get_option<bool>(params, "add_generation_prompt", true);
 auto now_str = get_option<std::string>(params, "now", "");
-
+
 std::map<std::string, std::string> chat_template_kwargs;
 if (params.Has("chat_template_kwargs") && params.Get("chat_template_kwargs").IsObject()) {
 auto kwargs_obj = params.Get("chat_template_kwargs").As<Napi::Object>();
@@ -873,7 +874,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
 auto enable_thinking = get_option<bool>(options, "enable_thinking", true);
 auto add_generation_prompt = get_option<bool>(options, "add_generation_prompt", true);
 auto now_str = get_option<std::string>(options, "now", "");
-
+
 std::map<std::string, std::string> chat_template_kwargs;
 if (options.Has("chat_template_kwargs") && options.Get("chat_template_kwargs").IsObject()) {
 auto kwargs_obj = options.Get("chat_template_kwargs").As<Napi::Object>();
@@ -886,7 +887,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
 }
 
 common_chat_params chatParams;
-
+
 try {
 chatParams = _rn_ctx->getFormattedChatWithJinja(
 json_stringify(messages), chat_template,
@@ -1043,7 +1044,7 @@ Napi::Value LlamaContext::Tokenize(const Napi::CallbackInfo &info) {
 }
 auto text = info[0].ToString().Utf8Value();
 std::vector<std::string> media_paths;
-
+
 if (info.Length() >= 2 && info[1].IsArray()) {
 // Direct array format: tokenize(text, [media_paths])
 auto media_paths_array = info[1].As<Napi::Array>();
@@ -1051,7 +1052,7 @@ Napi::Value LlamaContext::Tokenize(const Napi::CallbackInfo &info) {
 media_paths.push_back(media_paths_array.Get(i).ToString().Utf8Value());
 }
 }
-
+
 auto *worker = new TokenizeWorker(info, _rn_ctx, text, media_paths);
 worker->Queue();
 return worker->Promise();
@@ -1072,7 +1073,7 @@ Napi::Value LlamaContext::Detokenize(const Napi::CallbackInfo &info) {
 for (size_t i = 0; i < tokens.Length(); i++) {
 token_ids.push_back(tokens.Get(i).ToNumber().Int32Value());
 }
-
+
 auto *worker = new DetokenizeWorker(info, _rn_ctx, token_ids);
 worker->Queue();
 return worker->Promise();
@@ -1112,16 +1113,16 @@ Napi::Value LlamaContext::Rerank(const Napi::CallbackInfo &info) {
 Napi::TypeError::New(env, "Context is disposed")
 .ThrowAsJavaScriptException();
 }
-
+
 auto query = info[0].ToString().Utf8Value();
 auto documents_array = info[1].As<Napi::Array>();
-
+
 // Convert documents array to vector
 std::vector<std::string> documents;
 for (size_t i = 0; i < documents_array.Length(); i++) {
 documents.push_back(documents_array.Get(i).ToString().Utf8Value());
 }
-
+
 auto options = Napi::Object::New(env);
 if (info.Length() >= 3 && info[2].IsObject()) {
 options = info[2].As<Napi::Object>();
@@ -1130,7 +1131,7 @@ Napi::Value LlamaContext::Rerank(const Napi::CallbackInfo &info) {
 common_params rerankParams;
 rerankParams.embedding = true;
 rerankParams.embd_normalize = get_option<int32_t>(options, "normalize", -1);
-
+
 auto *worker = new RerankWorker(info, _rn_ctx, query, documents, rerankParams);
 worker->Queue();
 return worker->Promise();
@@ -1379,13 +1380,13 @@ LlamaContext::GetFormattedAudioCompletion(const Napi::CallbackInfo &info) {
 }
 auto text = info[1].ToString().Utf8Value();
 auto speaker_json = info[0].IsString() ? info[0].ToString().Utf8Value() : "";
-
+
 if (!_rn_ctx->tts_wrapper) {
 Napi::Error::New(env, "Vocoder not initialized")
 .ThrowAsJavaScriptException();
 return env.Undefined();
 }
-
+
 auto result_data = _rn_ctx->tts_wrapper->getFormattedAudioCompletion(_rn_ctx, speaker_json, text);
 Napi::Object result = Napi::Object::New(env);
 result.Set("prompt", Napi::String::New(env, result_data.prompt));
@@ -1406,13 +1407,13 @@ LlamaContext::GetAudioCompletionGuideTokens(const Napi::CallbackInfo &info) {
 return env.Undefined();
 }
 auto text = info[0].ToString().Utf8Value();
-
+
 if (!_rn_ctx->tts_wrapper) {
 Napi::Error::New(env, "Vocoder not initialized")
 .ThrowAsJavaScriptException();
 return env.Undefined();
 }
-
+
 auto result = _rn_ctx->tts_wrapper->getAudioCompletionGuideTokens(_rn_ctx, text);
 auto tokens = Napi::Int32Array::New(env, result.size());
 memcpy(tokens.Data(), result.data(), result.size() * sizeof(int32_t));
@@ -1448,7 +1449,7 @@ Napi::Value LlamaContext::DecodeAudioTokens(const Napi::CallbackInfo &info) {
 .ThrowAsJavaScriptException();
 return env.Undefined();
 }
-
+
 auto *worker = new DecodeAudioTokenWorker(info, _rn_ctx, tokens);
 worker->Queue();
 return worker->Promise();

package/src/llama.cpp/CMakeLists.txt
CHANGED
@@ -92,6 +92,7 @@ option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_
 
 # 3rd party libs
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
+option(LLAMA_HTTPLIB "llama: if libcurl is disabled, use httplib to download model from an URL" ON)
 option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
 
@@ -200,6 +201,9 @@ endif()
 
 if (LLAMA_BUILD_COMMON)
 add_subdirectory(common)
+if (LLAMA_HTTPLIB)
+add_subdirectory(vendor/cpp-httplib)
+endif()
 endif()
 
 if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)

package/src/llama.cpp/common/CMakeLists.txt
CHANGED
@@ -79,10 +79,11 @@ if (BUILD_SHARED_LIBS)
 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
 
+# TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
 set(LLAMA_COMMON_EXTRA_LIBS build_info)
 
-# Use curl to download model url
 if (LLAMA_CURL)
+# Use curl to download model url
 find_package(CURL)
 if (NOT CURL_FOUND)
 message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
@@ -90,42 +91,10 @@ if (LLAMA_CURL)
 target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
 include_directories(${CURL_INCLUDE_DIRS})
 set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
-
-
-
-
-if (OpenSSL_FOUND)
-include(CheckCSourceCompiles)
-set(SAVED_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
-set(CMAKE_REQUIRED_INCLUDES ${OPENSSL_INCLUDE_DIR})
-check_c_source_compiles("
-#include <openssl/opensslv.h>
-#if defined(OPENSSL_IS_BORINGSSL) || defined(LIBRESSL_VERSION_NUMBER)
-# if OPENSSL_VERSION_NUMBER < 0x1010107f
-# error bad version
-# endif
-#else
-# if OPENSSL_VERSION_NUMBER < 0x30000000L
-# error bad version
-# endif
-#endif
-int main() { return 0; }
-" OPENSSL_VERSION_SUPPORTED)
-set(CMAKE_REQUIRED_INCLUDES ${SAVED_CMAKE_REQUIRED_INCLUDES})
-if (OPENSSL_VERSION_SUPPORTED)
-message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
-target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT)
-target_link_libraries(${TARGET} PUBLIC OpenSSL::SSL OpenSSL::Crypto)
-if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
-target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
-find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED)
-find_library(SECURITY_FRAMEWORK Security REQUIRED)
-target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK})
-endif()
-endif()
-else()
-message(STATUS "OpenSSL not found, SSL support disabled")
-endif()
+elseif (LLAMA_HTTPLIB)
+# otherwise, use cpp-httplib
+target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
+set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
 endif()
 
 if (LLAMA_LLGUIDANCE)

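For context: the LLAMA_HTTPLIB option only takes effect through the compile definitions set above, with LLAMA_USE_CURL taking priority over LLAMA_USE_HTTPLIB. A minimal, hypothetical C++ sketch (not part of the package) of how a translation unit can observe which download backend the build selected; the macro names are the ones defined in these CMake hunks, the program itself is purely illustrative:

    // Sketch only: the build defines at most one of these macros, and
    // common/download.cpp (diffed below) picks its HTTP client accordingly.
    #include <cstdio>

    int main() {
    #if defined(LLAMA_USE_CURL)
        std::puts("model downloads: libcurl backend");
    #elif defined(LLAMA_USE_HTTPLIB)
        std::puts("model downloads: vendored cpp-httplib backend");
    #else
        std::puts("model downloads: disabled (the stub entry points throw)");
    #endif
        return 0;
    }
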
package/src/llama.cpp/common/common.cpp
CHANGED
@@ -355,11 +355,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
 }
 
 void common_init() {
-llama_log_set(
-if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
-common_log_add(common_log_main(), level, "%s", text);
-}
-}, NULL);
+llama_log_set(common_log_default_callback, NULL);
 
 #ifdef NDEBUG
 const char * build_type = "";

package/src/llama.cpp/common/download.cpp
CHANGED
@@ -20,7 +20,7 @@
 #if defined(LLAMA_USE_CURL)
 #include <curl/curl.h>
 #include <curl/easy.h>
-#
+#elif defined(LLAMA_USE_HTTPLIB)
 #include "http.h"
 #endif
 
@@ -467,7 +467,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
 return { res_code, std::move(res_buffer) };
 }
 
-#
+#elif defined(LLAMA_USE_HTTPLIB)
 
 static bool is_output_a_tty() {
 #if defined(_WIN32)
@@ -713,6 +713,8 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
 
 #endif // LLAMA_USE_CURL
 
+#if defined(LLAMA_USE_CURL) || defined(LLAMA_USE_HTTPLIB)
+
 static bool common_download_file_single(const std::string & url,
 const std::string & path,
 const std::string & bearer_token,
@@ -907,33 +909,6 @@ common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, cons
 return { hf_repo, ggufFile, mmprojFile };
 }
 
-std::vector<common_cached_model_info> common_list_cached_models() {
-std::vector<common_cached_model_info> models;
-const std::string cache_dir = fs_get_cache_directory();
-const std::vector<common_file_info> files = fs_list_files(cache_dir);
-for (const auto & file : files) {
-if (string_starts_with(file.name, "manifest=") && string_ends_with(file.name, ".json")) {
-common_cached_model_info model_info;
-model_info.manifest_path = file.path;
-std::string fname = file.name;
-string_replace_all(fname, ".json", ""); // remove extension
-auto parts = string_split<std::string>(fname, '=');
-if (parts.size() == 4) {
-// expect format: manifest=<user>=<model>=<tag>=<other>
-model_info.user = parts[1];
-model_info.model = parts[2];
-model_info.tag = parts[3];
-} else {
-// invalid format
-continue;
-}
-model_info.size = 0; // TODO: get GGUF size, not manifest size
-models.push_back(model_info);
-}
-}
-return models;
-}
-
 //
 // Docker registry functions
 //
@@ -1052,3 +1027,46 @@ std::string common_docker_resolve_model(const std::string & docker) {
 throw;
 }
 }
+
+#else
+
+common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
+throw std::runtime_error("download functionality is not enabled in this build");
+}
+
+bool common_download_model(const common_params_model &, const std::string &, bool) {
+throw std::runtime_error("download functionality is not enabled in this build");
+}
+
+std::string common_docker_resolve_model(const std::string &) {
+throw std::runtime_error("download functionality is not enabled in this build");
+}
+
+#endif // LLAMA_USE_CURL || LLAMA_USE_HTTPLIB
+
+std::vector<common_cached_model_info> common_list_cached_models() {
+std::vector<common_cached_model_info> models;
+const std::string cache_dir = fs_get_cache_directory();
+const std::vector<common_file_info> files = fs_list_files(cache_dir);
+for (const auto & file : files) {
+if (string_starts_with(file.name, "manifest=") && string_ends_with(file.name, ".json")) {
+common_cached_model_info model_info;
+model_info.manifest_path = file.path;
+std::string fname = file.name;
+string_replace_all(fname, ".json", ""); // remove extension
+auto parts = string_split<std::string>(fname, '=');
+if (parts.size() == 4) {
+// expect format: manifest=<user>=<model>=<tag>=<other>
+model_info.user = parts[1];
+model_info.model = parts[2];
+model_info.tag = parts[3];
+} else {
+// invalid format
+continue;
+}
+model_info.size = 0; // TODO: get GGUF size, not manifest size
+models.push_back(model_info);
+}
+}
+return models;
+}

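For context: the new #else branch keeps the three download entry points linkable when neither LLAMA_USE_CURL nor LLAMA_USE_HTTPLIB is defined, turning what would otherwise be a build failure into a runtime std::runtime_error. A hypothetical caller could fall back to a local file; this is a sketch only, the declaration is copied from the signature visible in the hunk (in a real build it comes from the common headers) and the wrapper function is illustrative:

    #include <cstdio>
    #include <stdexcept>
    #include <string>

    // Signature as it appears in the hunk above (normally pulled in via the common headers).
    std::string common_docker_resolve_model(const std::string & docker);

    // Illustrative helper: try the registry resolver, fall back to a local GGUF path
    // when this build was configured without any download backend.
    std::string resolve_model_or_fallback(const std::string & docker_ref,
                                          const std::string & local_path) {
        try {
            return common_docker_resolve_model(docker_ref);
        } catch (const std::runtime_error & e) {
            std::fprintf(stderr, "downloads unavailable (%s); using %s\n",
                         e.what(), local_path.c_str());
            return local_path;
        }
    }
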
package/src/llama.cpp/common/log.cpp
CHANGED
@@ -442,3 +442,9 @@ void common_log_set_prefix(struct common_log * log, bool prefix) {
 void common_log_set_timestamps(struct common_log * log, bool timestamps) {
 log->set_timestamps(timestamps);
 }
+
+void common_log_default_callback(enum ggml_log_level level, const char * text, void * /*user_data*/) {
+if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
+common_log_add(common_log_main(), level, "%s", text);
+}
+}

package/src/llama.cpp/common/log.h
CHANGED
@@ -36,6 +36,8 @@ extern int common_log_verbosity_thold;
 
 void common_log_set_verbosity_thold(int verbosity); // not thread-safe
 
+void common_log_default_callback(enum ggml_log_level level, const char * text, void * user_data);
+
 // the common_log uses an internal worker thread to print/write log messages
 // when the worker thread is paused, incoming log messages are discarded
 struct common_log;
