@fugood/llama.node 1.3.3 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CMakeLists.txt CHANGED
@@ -120,16 +120,20 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND NOT DEFINED GGML_OPENMP OR GGML_O
  endif()

  set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build common")
-
- set(LLAMA_BUILD_TOOLS ON CACHE BOOL "Build tools")
-
+ set(LLAMA_BUILD_TOOLS OFF CACHE BOOL "Build tools")
+ set(LLAMA_BUILD_TESTS OFF CACHE BOOL "Build tests")
+ set(LLAMA_BUILD_SERVER OFF CACHE BOOL "Build server")
+ set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "Build examples")
  set(LLAMA_CURL OFF CACHE BOOL "Build curl")

+ set(LLAMA_INSTALL_VERSION "0.0.0") # TODO: Set the version number (0.0.<BUILD_NUMBER>)
+
  set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries")

  add_definitions(-DGGML_MAX_NAME=80)

  add_subdirectory("src/llama.cpp")
+ add_subdirectory("src/llama.cpp/tools/mtmd")

  include_directories(
  ${CMAKE_JS_INC}
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.3.3",
+ "version": "1.3.4",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -72,19 +72,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.3.3",
- "@fugood/node-llama-linux-x64-vulkan": "1.3.3",
- "@fugood/node-llama-linux-x64-cuda": "1.3.3",
- "@fugood/node-llama-linux-arm64": "1.3.3",
- "@fugood/node-llama-linux-arm64-vulkan": "1.3.3",
- "@fugood/node-llama-linux-arm64-cuda": "1.3.3",
- "@fugood/node-llama-win32-x64": "1.3.3",
- "@fugood/node-llama-win32-x64-vulkan": "1.3.3",
- "@fugood/node-llama-win32-x64-cuda": "1.3.3",
- "@fugood/node-llama-win32-arm64": "1.3.3",
- "@fugood/node-llama-win32-arm64-vulkan": "1.3.3",
- "@fugood/node-llama-darwin-x64": "1.3.3",
- "@fugood/node-llama-darwin-arm64": "1.3.3"
+ "@fugood/node-llama-linux-x64": "1.3.4",
+ "@fugood/node-llama-linux-x64-vulkan": "1.3.4",
+ "@fugood/node-llama-linux-x64-cuda": "1.3.4",
+ "@fugood/node-llama-linux-arm64": "1.3.4",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.3.4",
+ "@fugood/node-llama-linux-arm64-cuda": "1.3.4",
+ "@fugood/node-llama-win32-x64": "1.3.4",
+ "@fugood/node-llama-win32-x64-vulkan": "1.3.4",
+ "@fugood/node-llama-win32-x64-cuda": "1.3.4",
+ "@fugood/node-llama-win32-arm64": "1.3.4",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.3.4",
+ "@fugood/node-llama-darwin-x64": "1.3.4",
+ "@fugood/node-llama-darwin-arm64": "1.3.4"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -9,10 +9,10 @@ Napi::Array TokenProbsToArray(Napi::Env env, llama_context* ctx, const std::vect
  for (size_t i = 0; i < probs.size(); i++) {
  const auto &prob = probs[i];
  Napi::Object token_obj = Napi::Object::New(env);
-
+
  std::string token_str = common_token_to_piece(ctx, prob.tok);
  token_obj.Set("content", Napi::String::New(env, token_str));
-
+
  Napi::Array token_probs = Napi::Array::New(env);
  for (size_t j = 0; j < prob.probs.size(); j++) {
  const auto &p = prob.probs[j];
@@ -83,10 +83,10 @@ void LlamaCompletionWorker::Execute() {
  }

  auto completion = _rn_ctx->completion;
-
+
  // Prepare completion context
  completion->rewind();
-
+
  // Set up parameters
  _rn_ctx->params.prompt = _params.prompt;
  _rn_ctx->params.sampling = _params.sampling;
@@ -95,50 +95,50 @@ void LlamaCompletionWorker::Execute() {
  _rn_ctx->params.n_ctx = _params.n_ctx;
  _rn_ctx->params.n_batch = _params.n_batch;
  _rn_ctx->params.ctx_shift = _params.ctx_shift;
-
+
  // Set prefill text
  completion->prefill_text = _prefill_text;
-
+
  // Set up TTS guide tokens if enabled
  if (_has_vocoder && _rn_ctx->tts_wrapper != nullptr) {
  _rn_ctx->tts_wrapper->guide_tokens = _guide_tokens;
  _rn_ctx->tts_wrapper->next_token_uses_guide_token = true;
  }
-
+
  // Initialize sampling
  if (!completion->initSampling()) {
  SetError("Failed to initialize sampling");
  return;
  }
-
+
  // Load prompt (handles both text-only and multimodal)
  completion->loadPrompt(_media_paths);
-
+
  // Check if context is full after loading prompt
  if (completion->context_full) {
  _result.context_full = true;
  return;
  }
-
+
  // Begin completion with chat format and reasoning settings
  completion->beginCompletion(_chat_format, common_reasoning_format_from_name(_reasoning_format), _thinking_forced_open);
-
+
  // Main completion loop
  int token_count = 0;
  const int max_tokens = _params.n_predict < 0 ? std::numeric_limits<int>::max() : _params.n_predict;
  while (completion->has_next_token && !_interrupted && token_count < max_tokens) {
  // Get next token using rn-llama completion
  rnllama::completion_token_output token_output = completion->doCompletion();
-
+
  if (token_output.tok == -1) {
  break;
  }
-
+
  token_count++;
-
+
  std::string token_text = common_token_to_piece(_rn_ctx->ctx, token_output.tok);
  _result.text += token_text;
-
+
  // Check for stopping strings after adding the token
  if (!_stop_words.empty()) {
  size_t stop_pos = completion->findStoppingStrings(_result.text, token_text.size(), rnllama::STOP_FULL);
@@ -148,7 +148,7 @@ void LlamaCompletionWorker::Execute() {
  break;
  }
  }
-
+
  // Handle streaming callback
  if (_has_callback && !completion->incomplete) {
  struct TokenData {
@@ -160,9 +160,9 @@ void LlamaCompletionWorker::Execute() {
  std::vector<rnllama::completion_token_output> completion_probabilities;
  llama_context* ctx;
  };
-
+
  auto partial_output = completion->parseChatOutput(true);
-
+
  // Extract completion probabilities if n_probs > 0, similar to iOS implementation
  std::vector<rnllama::completion_token_output> probs_output;
  if (_rn_ctx->params.sampling.n_probs > 0) {
@@ -171,23 +171,23 @@ void LlamaCompletionWorker::Execute() {
  size_t probs_stop_pos = std::min(_sent_token_probs_index + to_send_toks.size(), completion->generated_token_probs.size());
  if (probs_pos < probs_stop_pos) {
  probs_output = std::vector<rnllama::completion_token_output>(
- completion->generated_token_probs.begin() + probs_pos,
+ completion->generated_token_probs.begin() + probs_pos,
  completion->generated_token_probs.begin() + probs_stop_pos
  );
  }
  _sent_token_probs_index = probs_stop_pos;
  }
-
+
  TokenData *token_data = new TokenData{
- token_text,
- partial_output.content,
- partial_output.reasoning_content,
- partial_output.tool_calls,
+ token_text,
+ partial_output.content,
+ partial_output.reasoning_content,
+ partial_output.tool_calls,
  partial_output.accumulated_text,
  probs_output,
  _rn_ctx->ctx
  };
-
+
  _tsfn.BlockingCall(token_data, [](Napi::Env env, Napi::Function jsCallback,
  TokenData *data) {
  auto obj = Napi::Object::New(env);
@@ -216,25 +216,25 @@ void LlamaCompletionWorker::Execute() {
  obj.Set("tool_calls", tool_calls);
  }
  obj.Set("accumulated_text", Napi::String::New(env, data->accumulated_text));
-
+
  // Add completion_probabilities if available
  if (!data->completion_probabilities.empty()) {
  obj.Set("completion_probabilities", TokenProbsToArray(env, data->ctx, data->completion_probabilities));
  }
-
+
  delete data;
  jsCallback.Call({obj});
  });
  }
  }
-
+
  // Check stopping conditions
  if (token_count >= max_tokens) {
  _result.stopped_limited = true;
  } else if (!completion->has_next_token && completion->n_remain == 0) {
  _result.stopped_limited = true;
  }
-
+
  // Set completion results from rn-llama completion context
  // tokens_evaluated should include both prompt tokens and generated tokens that were processed
  _result.tokens_evaluated = completion->num_prompt_tokens + completion->num_tokens_predicted;
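(Aside: most of the hunks above only strip trailing whitespace. The substantive code is the streaming callback, which heap-allocates a TokenData payload on the worker thread and hands it to JavaScript through Napi::ThreadSafeFunction::BlockingCall. For readers unfamiliar with that pattern, here is a minimal, self-contained sketch of it using plain node-addon-api; the file name, the exported streamTokens function, and the sample tokens are invented for illustration and are not part of this package.)

```cpp
// streaming_example.cc -- illustrative only, not this package's code.
// Pattern: allocate a payload on a worker thread, push it to JS via
// ThreadSafeFunction::BlockingCall, convert and free it on the main thread.
#include <napi.h>
#include <string>
#include <thread>

struct TokenData {            // stands in for the worker's TokenData struct
  std::string token;
};

Napi::Value StreamTokens(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();

  // Wrap the JS callback so a non-JS thread can invoke it safely.
  auto tsfn = Napi::ThreadSafeFunction::New(
      env, info[0].As<Napi::Function>(), "token-stream",
      /*maxQueueSize*/ 0, /*initialThreadCount*/ 1);

  std::thread([tsfn]() mutable {
    for (const char *tok : {"Hello", ",", " world"}) {
      auto *data = new TokenData{tok};
      tsfn.BlockingCall(data, [](Napi::Env env, Napi::Function jsCallback,
                                 TokenData *data) {
        auto obj = Napi::Object::New(env);
        obj.Set("token", Napi::String::New(env, data->token));
        delete data;            // payload ownership ends on the main thread
        jsCallback.Call({obj});
      });
    }
    tsfn.Release();             // let the event loop exit once the queue drains
  }).detach();

  return env.Undefined();
}

Napi::Object Init(Napi::Env env, Napi::Object exports) {
  exports.Set("streamTokens", Napi::Function::New(env, StreamTokens));
  return exports;
}

NODE_API_MODULE(streaming_example, Init)
```

Ownership of the payload transfers to the lambda, which deletes it on the main thread after converting it to JS values — the same arrangement the worker uses for its per-token objects.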
@@ -245,20 +245,20 @@ void LlamaCompletionWorker::Execute() {
  _result.stopped_words = completion->stopped_word;
  _result.stopping_word = completion->stopping_word;
  _result.stopped_limited = completion->stopped_limit;
-
+
  // Get audio tokens if TTS is enabled
  if (_has_vocoder && _rn_ctx->tts_wrapper != nullptr) {
  _result.audio_tokens = _rn_ctx->tts_wrapper->audio_tokens;
  }
-
+ common_perf_print(_rn_ctx->ctx, _rn_ctx->completion->ctx_sampling);
  // End completion
  completion->endCompletion();
-
+
  } catch (const std::exception &e) {
  SetError(e.what());
  return;
  }
-
+
  if (_onComplete) {
  _onComplete();
  }
@@ -376,6 +376,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  _rn_ctx = nullptr;
  Napi::TypeError::New(env, "Failed to load model").ThrowAsJavaScriptException();
  }
+ _rn_ctx->attachThreadpoolsIfAvailable();

  // Release progress callback after model is loaded
  if (has_progress_callback) {
@@ -386,7 +387,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  if (!lora.empty()) {
  _rn_ctx->applyLoraAdapters(lora);
  }
-
+
  _info = common_params_get_system_info(params);
  }

@@ -636,7 +637,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
  auto enable_thinking = get_option<bool>(params, "enable_thinking", false);
  auto add_generation_prompt = get_option<bool>(params, "add_generation_prompt", true);
  auto now_str = get_option<std::string>(params, "now", "");
-
+
  std::map<std::string, std::string> chat_template_kwargs;
  if (params.Has("chat_template_kwargs") && params.Get("chat_template_kwargs").IsObject()) {
  auto kwargs_obj = params.Get("chat_template_kwargs").As<Napi::Object>();
@@ -873,7 +874,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  auto enable_thinking = get_option<bool>(options, "enable_thinking", true);
  auto add_generation_prompt = get_option<bool>(options, "add_generation_prompt", true);
  auto now_str = get_option<std::string>(options, "now", "");
-
+
  std::map<std::string, std::string> chat_template_kwargs;
  if (options.Has("chat_template_kwargs") && options.Get("chat_template_kwargs").IsObject()) {
  auto kwargs_obj = options.Get("chat_template_kwargs").As<Napi::Object>();
@@ -886,7 +887,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  }

  common_chat_params chatParams;
-
+
  try {
  chatParams = _rn_ctx->getFormattedChatWithJinja(
  json_stringify(messages), chat_template,
@@ -1043,7 +1044,7 @@ Napi::Value LlamaContext::Tokenize(const Napi::CallbackInfo &info) {
  }
  auto text = info[0].ToString().Utf8Value();
  std::vector<std::string> media_paths;
-
+
  if (info.Length() >= 2 && info[1].IsArray()) {
  // Direct array format: tokenize(text, [media_paths])
  auto media_paths_array = info[1].As<Napi::Array>();
@@ -1051,7 +1052,7 @@ Napi::Value LlamaContext::Tokenize(const Napi::CallbackInfo &info) {
  media_paths.push_back(media_paths_array.Get(i).ToString().Utf8Value());
  }
  }
-
+
  auto *worker = new TokenizeWorker(info, _rn_ctx, text, media_paths);
  worker->Queue();
  return worker->Promise();
@@ -1072,7 +1073,7 @@ Napi::Value LlamaContext::Detokenize(const Napi::CallbackInfo &info) {
  for (size_t i = 0; i < tokens.Length(); i++) {
  token_ids.push_back(tokens.Get(i).ToNumber().Int32Value());
  }
-
+
  auto *worker = new DetokenizeWorker(info, _rn_ctx, token_ids);
  worker->Queue();
  return worker->Promise();
@@ -1112,16 +1113,16 @@ Napi::Value LlamaContext::Rerank(const Napi::CallbackInfo &info) {
  Napi::TypeError::New(env, "Context is disposed")
  .ThrowAsJavaScriptException();
  }
-
+
  auto query = info[0].ToString().Utf8Value();
  auto documents_array = info[1].As<Napi::Array>();
-
+
  // Convert documents array to vector
  std::vector<std::string> documents;
  for (size_t i = 0; i < documents_array.Length(); i++) {
  documents.push_back(documents_array.Get(i).ToString().Utf8Value());
  }
-
+
  auto options = Napi::Object::New(env);
  if (info.Length() >= 3 && info[2].IsObject()) {
  options = info[2].As<Napi::Object>();
@@ -1130,7 +1131,7 @@ Napi::Value LlamaContext::Rerank(const Napi::CallbackInfo &info) {
  common_params rerankParams;
  rerankParams.embedding = true;
  rerankParams.embd_normalize = get_option<int32_t>(options, "normalize", -1);
-
+
  auto *worker = new RerankWorker(info, _rn_ctx, query, documents, rerankParams);
  worker->Queue();
  return worker->Promise();
@@ -1379,13 +1380,13 @@ LlamaContext::GetFormattedAudioCompletion(const Napi::CallbackInfo &info) {
  }
  auto text = info[1].ToString().Utf8Value();
  auto speaker_json = info[0].IsString() ? info[0].ToString().Utf8Value() : "";
-
+
  if (!_rn_ctx->tts_wrapper) {
  Napi::Error::New(env, "Vocoder not initialized")
  .ThrowAsJavaScriptException();
  return env.Undefined();
  }
-
+
  auto result_data = _rn_ctx->tts_wrapper->getFormattedAudioCompletion(_rn_ctx, speaker_json, text);
  Napi::Object result = Napi::Object::New(env);
  result.Set("prompt", Napi::String::New(env, result_data.prompt));
@@ -1406,13 +1407,13 @@ LlamaContext::GetAudioCompletionGuideTokens(const Napi::CallbackInfo &info) {
  return env.Undefined();
  }
  auto text = info[0].ToString().Utf8Value();
-
+
  if (!_rn_ctx->tts_wrapper) {
  Napi::Error::New(env, "Vocoder not initialized")
  .ThrowAsJavaScriptException();
  return env.Undefined();
  }
-
+
  auto result = _rn_ctx->tts_wrapper->getAudioCompletionGuideTokens(_rn_ctx, text);
  auto tokens = Napi::Int32Array::New(env, result.size());
  memcpy(tokens.Data(), result.data(), result.size() * sizeof(int32_t));
@@ -1448,7 +1449,7 @@ Napi::Value LlamaContext::DecodeAudioTokens(const Napi::CallbackInfo &info) {
  .ThrowAsJavaScriptException();
  return env.Undefined();
  }
-
+
  auto *worker = new DecodeAudioTokenWorker(info, _rn_ctx, tokens);
  worker->Queue();
  return worker->Promise();
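(Aside: the Tokenize, Detokenize, Rerank, and DecodeAudioTokens hunks above all end the same way: construct a worker, Queue() it, and return worker->Promise(). That is the usual promise-backed Napi::AsyncWorker arrangement, in which Execute() runs off the main thread and SetError() surfaces as a promise rejection. Below is a generic, self-contained sketch of that arrangement; the EchoWorker class and echo export are hypothetical and assume nothing about this package beyond the pattern itself.)

```cpp
// promise_worker_example.cc -- illustrative only, not this package's code.
#include <napi.h>
#include <string>

class EchoWorker : public Napi::AsyncWorker {
 public:
  EchoWorker(Napi::Env env, std::string input)
      : Napi::AsyncWorker(env),
        _deferred(Napi::Promise::Deferred::New(env)),
        _input(std::move(input)) {}

  Napi::Promise Promise() { return _deferred.Promise(); }

  // Runs on a worker thread: no Napi:: value creation allowed here.
  void Execute() override {
    if (_input.empty()) {
      SetError("Input must not be empty");   // surfaces as a promise rejection
      return;
    }
    _result = _input + "!";
  }

  // Back on the main thread: convert the result and settle the promise.
  void OnOK() override { _deferred.Resolve(Napi::String::New(Env(), _result)); }
  void OnError(const Napi::Error &e) override { _deferred.Reject(e.Value()); }

 private:
  Napi::Promise::Deferred _deferred;
  std::string _input, _result;
};

Napi::Value Echo(const Napi::CallbackInfo &info) {
  auto *worker = new EchoWorker(info.Env(), info[0].ToString().Utf8Value());
  worker->Queue();               // AsyncWorker deletes itself after OnOK/OnError
  return worker->Promise();
}

Napi::Object Init(Napi::Env env, Napi::Object exports) {
  exports.Set("echo", Napi::Function::New(env, Echo));
  return exports;
}

NODE_API_MODULE(promise_worker_example, Init)
```

Queue() hands the work to libuv's thread pool, and the default AsyncWorker lifecycle deletes the instance after OnOK/OnError, which is why the bare `new` with no matching `delete` is idiomatic in this pattern.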