@fugood/llama.node 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/common/CMakeLists.txt +4 -5
- package/src/llama.cpp/common/arg.cpp +37 -0
- package/src/llama.cpp/common/common.cpp +22 -6
- package/src/llama.cpp/common/common.h +14 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
- package/src/llama.cpp/ggml/include/ggml.h +13 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +23 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +39 -0
- package/src/llama.cpp/include/llama.h +13 -48
- package/src/llama.cpp/src/llama-arch.cpp +222 -15
- package/src/llama.cpp/src/llama-arch.h +16 -1
- package/src/llama.cpp/src/llama-batch.cpp +76 -70
- package/src/llama.cpp/src/llama-batch.h +24 -18
- package/src/llama.cpp/src/llama-chat.cpp +44 -1
- package/src/llama.cpp/src/llama-chat.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +134 -95
- package/src/llama.cpp/src/llama-context.h +13 -16
- package/src/llama.cpp/src/llama-cparams.h +3 -2
- package/src/llama.cpp/src/llama-graph.cpp +239 -154
- package/src/llama.cpp/src/llama-graph.h +162 -126
- package/src/llama.cpp/src/llama-hparams.cpp +45 -0
- package/src/llama.cpp/src/llama-hparams.h +11 -1
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
- package/src/llama.cpp/src/llama-kv-cache-unified.h +89 -31
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -9
- package/src/llama.cpp/src/llama-model.cpp +2309 -665
- package/src/llama.cpp/src/llama-model.h +18 -4
- package/src/llama.cpp/src/llama-quant.cpp +2 -2
- package/src/llama.cpp/src/llama-vocab.cpp +368 -9
- package/src/llama.cpp/src/llama-vocab.h +43 -0
- package/src/llama.cpp/src/unicode.cpp +207 -0
- package/src/llama.cpp/src/unicode.h +2 -0
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.0.3",
+  "version": "1.0.4",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -70,19 +70,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.0.3",
-    "@fugood/node-llama-linux-x64-vulkan": "1.0.3",
-    "@fugood/node-llama-linux-x64-cuda": "1.0.3",
-    "@fugood/node-llama-linux-arm64": "1.0.3",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.0.3",
-    "@fugood/node-llama-linux-arm64-cuda": "1.0.3",
-    "@fugood/node-llama-win32-x64": "1.0.3",
-    "@fugood/node-llama-win32-x64-vulkan": "1.0.3",
-    "@fugood/node-llama-win32-x64-cuda": "1.0.3",
-    "@fugood/node-llama-win32-arm64": "1.0.3",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.0.3",
-    "@fugood/node-llama-darwin-x64": "1.0.3",
-    "@fugood/node-llama-darwin-arm64": "1.0.3"
+    "@fugood/node-llama-linux-x64": "1.0.4",
+    "@fugood/node-llama-linux-x64-vulkan": "1.0.4",
+    "@fugood/node-llama-linux-x64-cuda": "1.0.4",
+    "@fugood/node-llama-linux-arm64": "1.0.4",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.0.4",
+    "@fugood/node-llama-linux-arm64-cuda": "1.0.4",
+    "@fugood/node-llama-win32-x64": "1.0.4",
+    "@fugood/node-llama-win32-x64-vulkan": "1.0.4",
+    "@fugood/node-llama-win32-x64-cuda": "1.0.4",
+    "@fugood/node-llama-win32-arm64": "1.0.4",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.0.4",
+    "@fugood/node-llama-darwin-x64": "1.0.4",
+    "@fugood/node-llama-darwin-arm64": "1.0.4"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/llama.cpp/common/CMakeLists.txt
CHANGED
@@ -86,8 +86,7 @@ if (LLAMA_CURL)
     endif()
     target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
     include_directories(${CURL_INCLUDE_DIRS})
-
-    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
 endif ()
 
 if (LLAMA_LLGUIDANCE)
@@ -112,13 +111,13 @@ if (LLAMA_LLGUIDANCE)
 
     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        #
-        GIT_TAG
+        # v1.0.1:
+        GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
         PREFIX ${CMAKE_BINARY_DIR}/llguidance
         SOURCE_DIR ${LLGUIDANCE_SRC}
         BUILD_IN_SOURCE TRUE
         CONFIGURE_COMMAND ""
-        BUILD_COMMAND cargo build --release
+        BUILD_COMMAND cargo build --release --package llguidance
         INSTALL_COMMAND ""
         BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
         UPDATE_COMMAND ""
package/src/llama.cpp/common/arg.cpp
CHANGED
@@ -1464,6 +1464,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.swa_full = true;
         }
     ).set_env("LLAMA_ARG_SWA_FULL"));
+    add_opt(common_arg(
+        {"--kv-unified", "-kvu"},
+        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
+        [](common_params & params) {
+            params.kv_unified = true;
+        }
+    ).set_env("LLAMA_ARG_KV_SPLIT"));
     add_opt(common_arg(
         {"--no-context-shift"},
         string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -3423,5 +3431,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
+    // diffusion parameters
+    add_opt(common_arg(
+        { "--diffusion-steps" }, "N",
+        string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
+        [](common_params & params, int value) { params.diffusion.steps = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-eps" }, "F",
+        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-algorithm" }, "N",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
+                      params.diffusion.algorithm),
+        [](common_params & params, int value) { params.diffusion.algorithm = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-alg-temp" }, "F",
+        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-visual" },
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+                      params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     return ctx_arg;
 }
package/src/llama.cpp/common/common.cpp
CHANGED
@@ -448,6 +448,15 @@ void string_replace_all(std::string & s, const std::string & search, const std::
 bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
     return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
 }
+
+bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
+    bool has_suffix = string_ends_with(str, suffix);
+    if (has_suffix) {
+        str = str.substr(0, str.size() - suffix.size());
+    }
+    return has_suffix;
+}
+
 size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
     if (!str.empty() && !stop.empty()) {
        const char text_last_char = str.back();
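A minimal usage sketch for the new string_remove_suffix helper (illustrative only, not part of the diff):

    // strips the suffix in place and reports whether it was present
    std::string text = "Hello<|im_end|>";
    if (string_remove_suffix(text, "<|im_end|>")) {
        // text is now "Hello"
    }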
@@ -1005,15 +1014,21 @@ struct common_init_result common_init_from_params(common_params & params) {
         params.sampling.ignore_eos = false;
     }
 
-
-
-
-
-
-            }
+    // initialize once
+    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+        if (llama_vocab_is_eog(vocab, i)) {
+            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
         }
     }
 
+    if (params.sampling.ignore_eos) {
+        // add EOG biases to the active set of logit biases
+        params.sampling.logit_bias.insert(
+            params.sampling.logit_bias.end(),
+            params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
+    }
+
     if (params.sampling.penalty_last_n == -1) {
         LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
         params.sampling.penalty_last_n = llama_n_ctx(lctx);
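The hunk above splits end-of-generation (EOG) handling in two steps: the biases are computed once into logit_bias_eog, and merged into the active logit_bias list only when ignore_eos is requested. A rough sketch of the resulting behaviour (illustrative, not part of the diff):

    common_params params;
    params.sampling.ignore_eos = true;
    // after common_init_from_params(params), params.sampling.logit_bias contains
    // one {token, -INFINITY} entry per EOG token, copied from logit_bias_eog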
@@ -1158,6 +1173,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;
+    cparams.kv_unified = params.kv_unified;
 
     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;
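The new kv_unified flag is forwarded into llama_context_params above. A minimal sketch of setting it directly through the C API (assumes the llama_context_default_params / llama_init_from_model API from llama.h in this release; illustrative, not part of the diff):

    // "model" is an already loaded llama_model *
    llama_context_params cparams = llama_context_default_params();
    cparams.kv_unified = true; // single unified KV buffer shared by all sequences
    llama_context * lctx = llama_init_from_model(model, cparams);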
package/src/llama.cpp/common/common.h
CHANGED
@@ -81,6 +81,7 @@ enum llama_example {
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
+    LLAMA_EXAMPLE_DIFFUSION,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -177,7 +178,8 @@ struct common_params_sampling {
     std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
     std::set<llama_token> preserved_tokens;
 
-    std::vector<llama_logit_bias> logit_bias;
+    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
+    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
 
     // print the parameters into a string
     std::string print() const;
@@ -217,6 +219,14 @@ struct common_params_vocoder {
     bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
 };
 
+struct common_params_diffusion {
+    int32_t steps       = 64;    // number of diffusion steps
+    float   eps         = 1e-3f; // epsilon for timesteps
+    int32_t algorithm   = 0;     // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
+    float   alg_temp    = 0.0f;  // algorithm temperature
+    bool    visual_mode = false; // show progressive diffusion on screen
+};
+
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
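The new diffusion options in arg.cpp populate this common_params_diffusion struct. For example, with hypothetical values (illustrative, not part of the diff), passing "--diffusion-steps 128 --diffusion-visual" is equivalent to:

    common_params params;
    params.diffusion.steps       = 128;  // what --diffusion-steps 128 sets
    params.diffusion.visual_mode = true; // what --diffusion-visual sets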
@@ -269,6 +279,7 @@ struct common_params {
     struct common_params_sampling sampling;
     struct common_params_speculative speculative;
     struct common_params_vocoder vocoder;
+    struct common_params_diffusion diffusion;
 
     struct common_params_model model;
 
@@ -331,6 +342,7 @@ struct common_params {
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = true; // context shift on inifinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+    bool kv_unified = false; // enable unified KV cache
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap = true; // use mmap for faster loads
@@ -523,6 +535,7 @@ static bool string_starts_with(const std::string & str,
 
 // While we wait for C++20's std::string::ends_with...
 bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
+bool string_remove_suffix(std::string & str, const std::string_view & suffix);
 size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
 
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
package/src/llama.cpp/ggml/CMakeLists.txt
CHANGED
@@ -181,6 +181,8 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
 option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
 option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
 option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
+option(GGML_WEBGPU "ggml: use WebGPU" OFF)
+option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
 option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
 option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
 option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -270,6 +272,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-rpc.h
     include/ggml-sycl.h
     include/ggml-vulkan.h
+    include/ggml-webgpu.h
     include/gguf.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
package/src/llama.cpp/ggml/include/ggml-webgpu.h
ADDED
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_WEBGPU_NAME "WebGPU"
+
+// Needed for examples in ggml
+GGML_BACKEND_API ggml_backend_t ggml_backend_webgpu_init(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_webgpu_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
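The new header declares the WebGPU backend entry points. A minimal sketch of bringing the backend up (requires a build configured with GGML_WEBGPU; illustrative, not part of the diff):

    #include "ggml-webgpu.h"

    ggml_backend_t backend = ggml_backend_webgpu_init();
    if (backend == NULL) {
        // WebGPU backend unavailable (e.g. built without GGML_WEBGPU)
    }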
package/src/llama.cpp/ggml/include/ggml.h
CHANGED
@@ -1297,6 +1297,19 @@ extern "C" {
             struct ggml_tensor  * a,
             float                 s);
 
+    // x = s * a + b
+    GGML_API struct ggml_tensor * ggml_scale_bias(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 s,
+            float                 b);
+
+    GGML_API struct ggml_tensor * ggml_scale_bias_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 s,
+            float                 b);
+
     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set(
             struct ggml_context * ctx,