@fugood/llama.node 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/CMakeLists.txt +0 -1
- package/src/llama.cpp/common/CMakeLists.txt +4 -5
- package/src/llama.cpp/common/arg.cpp +44 -0
- package/src/llama.cpp/common/common.cpp +22 -6
- package/src/llama.cpp/common/common.h +15 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +10 -2
- package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
- package/src/llama.cpp/ggml/include/ggml.h +104 -10
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +12 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +749 -163
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +12 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +88 -9
- package/src/llama.cpp/include/llama.h +13 -47
- package/src/llama.cpp/src/llama-arch.cpp +298 -3
- package/src/llama.cpp/src/llama-arch.h +22 -1
- package/src/llama.cpp/src/llama-batch.cpp +103 -71
- package/src/llama.cpp/src/llama-batch.h +31 -18
- package/src/llama.cpp/src/llama-chat.cpp +59 -1
- package/src/llama.cpp/src/llama-chat.h +3 -0
- package/src/llama.cpp/src/llama-context.cpp +134 -95
- package/src/llama.cpp/src/llama-context.h +13 -16
- package/src/llama.cpp/src/llama-cparams.h +3 -2
- package/src/llama.cpp/src/llama-graph.cpp +279 -180
- package/src/llama.cpp/src/llama-graph.h +183 -122
- package/src/llama.cpp/src/llama-hparams.cpp +47 -1
- package/src/llama.cpp/src/llama-hparams.h +12 -1
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
- package/src/llama.cpp/src/llama-kv-cache-unified.h +143 -47
- package/src/llama.cpp/src/llama-kv-cells.h +62 -10
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
- package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +21 -11
- package/src/llama.cpp/src/llama-memory.cpp +17 -0
- package/src/llama.cpp/src/llama-memory.h +3 -0
- package/src/llama.cpp/src/llama-model.cpp +3373 -743
- package/src/llama.cpp/src/llama-model.h +20 -4
- package/src/llama.cpp/src/llama-quant.cpp +2 -2
- package/src/llama.cpp/src/llama-vocab.cpp +376 -10
- package/src/llama.cpp/src/llama-vocab.h +43 -0
- package/src/llama.cpp/src/unicode.cpp +207 -0
- package/src/llama.cpp/src/unicode.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/package.json
CHANGED

@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.0.2",
+  "version": "1.0.4",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -70,19 +70,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.0.
-    "@fugood/node-llama-linux-x64-vulkan": "1.0.
-    "@fugood/node-llama-linux-x64-cuda": "1.0.
-    "@fugood/node-llama-linux-arm64": "1.0.
-    "@fugood/node-llama-linux-arm64-vulkan": "1.0.
-    "@fugood/node-llama-linux-arm64-cuda": "1.0.
-    "@fugood/node-llama-win32-x64": "1.0.
-    "@fugood/node-llama-win32-x64-vulkan": "1.0.
-    "@fugood/node-llama-win32-x64-cuda": "1.0.
-    "@fugood/node-llama-win32-arm64": "1.0.
-    "@fugood/node-llama-win32-arm64-vulkan": "1.0.
-    "@fugood/node-llama-darwin-x64": "1.0.
-    "@fugood/node-llama-darwin-arm64": "1.0.
+    "@fugood/node-llama-linux-x64": "1.0.4",
+    "@fugood/node-llama-linux-x64-vulkan": "1.0.4",
+    "@fugood/node-llama-linux-x64-cuda": "1.0.4",
+    "@fugood/node-llama-linux-arm64": "1.0.4",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.0.4",
+    "@fugood/node-llama-linux-arm64-cuda": "1.0.4",
+    "@fugood/node-llama-win32-x64": "1.0.4",
+    "@fugood/node-llama-win32-x64-vulkan": "1.0.4",
+    "@fugood/node-llama-win32-x64-cuda": "1.0.4",
+    "@fugood/node-llama-win32-arm64": "1.0.4",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.0.4",
+    "@fugood/node-llama-darwin-x64": "1.0.4",
+    "@fugood/node-llama-darwin-arm64": "1.0.4"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/llama.cpp/CMakeLists.txt
CHANGED

@@ -120,7 +120,6 @@ endfunction()
 
 llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
 llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
-llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
 llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
 llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
 llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
package/src/llama.cpp/common/CMakeLists.txt
CHANGED

@@ -86,8 +86,7 @@ if (LLAMA_CURL)
     endif()
     target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
     include_directories(${CURL_INCLUDE_DIRS})
-
-    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
 endif ()
 
 if (LLAMA_LLGUIDANCE)
@@ -112,13 +111,13 @@ if (LLAMA_LLGUIDANCE)
 
     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        #
-        GIT_TAG
+        # v1.0.1:
+        GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
         PREFIX ${CMAKE_BINARY_DIR}/llguidance
         SOURCE_DIR ${LLGUIDANCE_SRC}
         BUILD_IN_SOURCE TRUE
         CONFIGURE_COMMAND ""
-        BUILD_COMMAND cargo build --release
+        BUILD_COMMAND cargo build --release --package llguidance
         INSTALL_COMMAND ""
         BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
         UPDATE_COMMAND ""
package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -1464,6 +1464,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.swa_full = true;
         }
     ).set_env("LLAMA_ARG_SWA_FULL"));
+    add_opt(common_arg(
+        {"--kv-unified", "-kvu"},
+        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
+        [](common_params & params) {
+            params.kv_unified = true;
+        }
+    ).set_env("LLAMA_ARG_KV_SPLIT"));
     add_opt(common_arg(
         {"--no-context-shift"},
         string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
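Note: a minimal sketch of how the new option is expected to reach the llama API (illustrative only; it relies on the common_context_params_to_llama() mapping shown in the common.cpp hunk further down):

    // enable the unified KV cache programmatically instead of via --kv-unified / LLAMA_ARG_KV_SPLIT
    common_params params;
    params.kv_unified = true;                      // the same field the new CLI option sets
    llama_context_params cparams = common_context_params_to_llama(params);
    // cparams.kv_unified now carries the setting into context creation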
@@ -2734,6 +2742,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.public_path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+    add_opt(common_arg(
+        {"--api-prefix"}, "PREFIX",
+        string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.api_prefix = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
     add_opt(common_arg(
         {"--no-webui"},
         string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
@@ -3416,5 +3431,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
+    // diffusion parameters
+    add_opt(common_arg(
+        { "--diffusion-steps" }, "N",
+        string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
+        [](common_params & params, int value) { params.diffusion.steps = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-eps" }, "F",
+        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-algorithm" }, "N",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
+            params.diffusion.algorithm),
+        [](common_params & params, int value) { params.diffusion.algorithm = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-alg-temp" }, "F",
+        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-visual" },
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+            params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     return ctx_arg;
 }
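A rough parsing sketch for the new diffusion flags (illustrative; it assumes the usual common_params_parse() entry point from common/arg.h, which is not part of this diff):

    common_params params;
    // e.g. invoked as: <tool> --diffusion-steps 128 --diffusion-algorithm 3 --diffusion-visual
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION)) {
        return 1;
    }
    printf("diffusion: steps=%d eps=%g algorithm=%d visual=%d\n",
        params.diffusion.steps, (double) params.diffusion.eps,
        params.diffusion.algorithm, params.diffusion.visual_mode ? 1 : 0);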
package/src/llama.cpp/common/common.cpp
CHANGED

@@ -448,6 +448,15 @@ void string_replace_all(std::string & s, const std::string & search, const std::
 bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
     return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
 }
+
+bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
+    bool has_suffix = string_ends_with(str, suffix);
+    if (has_suffix) {
+        str = str.substr(0, str.size() - suffix.size());
+    }
+    return has_suffix;
+}
+
 size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
     if (!str.empty() && !stop.empty()) {
         const char text_last_char = str.back();
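A small usage sketch of the new helper (illustrative):

    std::string text = "Hello<|im_end|>";
    // trims the suffix in place and reports whether it was present
    if (string_remove_suffix(text, "<|im_end|>")) {
        // text is now "Hello"
    }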
@@ -1005,15 +1014,21 @@ struct common_init_result common_init_from_params(common_params & params) {
         params.sampling.ignore_eos = false;
     }
 
-
-
-
-
-
-    }
+    // initialize once
+    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+        if (llama_vocab_is_eog(vocab, i)) {
+            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
         }
     }
 
+    if (params.sampling.ignore_eos) {
+        // add EOG biases to the active set of logit biases
+        params.sampling.logit_bias.insert(
+            params.sampling.logit_bias.end(),
+            params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
+    }
+
     if (params.sampling.penalty_last_n == -1) {
         LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
         params.sampling.penalty_last_n = llama_n_ctx(lctx);
@@ -1158,6 +1173,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;
+    cparams.kv_unified = params.kv_unified;
 
     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;
package/src/llama.cpp/common/common.h
CHANGED

@@ -81,6 +81,7 @@ enum llama_example {
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
+    LLAMA_EXAMPLE_DIFFUSION,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -177,7 +178,8 @@ struct common_params_sampling {
     std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
     std::set<llama_token> preserved_tokens;
 
-    std::vector<llama_logit_bias> logit_bias;
+    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
+    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
 
     // print the parameters into a string
     std::string print() const;
@@ -217,6 +219,14 @@ struct common_params_vocoder {
     bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
 };
 
+struct common_params_diffusion {
+    int32_t steps = 64;          // number of diffusion steps
+    float   eps = 1e-3f;         // epsilon for timesteps
+    int32_t algorithm = 0;       // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
+    float   alg_temp = 0.0f;     // algorithm temperature
+    bool    visual_mode = false; // show progressive diffusion on screen
+};
+
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
@@ -269,6 +279,7 @@ struct common_params {
     struct common_params_sampling sampling;
     struct common_params_speculative speculative;
     struct common_params_vocoder vocoder;
+    struct common_params_diffusion diffusion;
 
     struct common_params_model model;
 
@@ -331,6 +342,7 @@ struct common_params {
     bool no_perf = false;   // disable performance metrics
     bool ctx_shift = true;  // context shift on inifinite text generation
     bool swa_full = false;  // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+    bool kv_unified = false; // enable unified KV cache
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap = true;          // use mmap for faster loads
@@ -371,6 +383,7 @@ struct common_params {
 
     std::string hostname = "127.0.0.1";
     std::string public_path = "";   // NOLINT
+    std::string api_prefix = "";    // NOLINT
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false;         // NOLINT
     bool enable_chat_template = true;
@@ -522,6 +535,7 @@ static bool string_starts_with(const std::string & str,
 
 // While we wait for C++20's std::string::ends_with...
 bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
+bool string_remove_suffix(std::string & str, const std::string_view & suffix);
 size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
 
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
package/src/llama.cpp/ggml/CMakeLists.txt
CHANGED

@@ -181,7 +181,8 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
 option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
 option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
 option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
-option(
+option(GGML_WEBGPU "ggml: use WebGPU" OFF)
+option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
 option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
 option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
 option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -266,12 +267,12 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-cann.h
     include/ggml-cpp.h
     include/ggml-cuda.h
-    include/ggml-kompute.h
     include/ggml-opt.h
     include/ggml-metal.h
     include/ggml-rpc.h
     include/ggml-sycl.h
     include/ggml-vulkan.h
+    include/ggml-webgpu.h
     include/gguf.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
@@ -360,6 +361,13 @@ write_basic_package_version_file(
     VERSION ${GGML_INSTALL_VERSION}
     COMPATIBILITY SameMajorVersion)
 
+target_compile_definitions(ggml-base PRIVATE
+    GGML_VERSION="${GGML_INSTALL_VERSION}"
+    GGML_COMMIT="${GGML_BUILD_COMMIT}"
+)
+message(STATUS "ggml version: ${GGML_INSTALL_VERSION}")
+message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}")
+
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
               ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
package/src/llama.cpp/ggml/include/ggml-webgpu.h
ADDED

@@ -0,0 +1,19 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_WEBGPU_NAME "WebGPU"
+
+// Needed for examples in ggml
+GGML_BACKEND_API ggml_backend_t ggml_backend_webgpu_init(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_webgpu_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
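A minimal sketch of the new backend entry points (illustrative; assumes the library was built with -DGGML_WEBGPU=ON, per the CMake option added above):

    #include <stdio.h>
    #include "ggml-backend.h"
    #include "ggml-webgpu.h"

    // initialize the WebGPU backend, query its name, then release it
    ggml_backend_t backend = ggml_backend_webgpu_init();
    if (backend != NULL) {
        printf("backend: %s\n", ggml_backend_name(backend)); // expected to match GGML_WEBGPU_NAME
        ggml_backend_free(backend);
    }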
package/src/llama.cpp/ggml/include/ggml.h
CHANGED

@@ -314,6 +314,13 @@
 extern "C" {
 #endif
 
+    // Function type used in fatal error callbacks
+    typedef void (*ggml_abort_callback_t)(const char * error_message);
+
+    // Set the abort callback (passing null will restore original abort functionality: printing a message to stdout)
+    // Returns the old callback for chaining
+    GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback);
+
     GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
     GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
 
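A short sketch of installing the new hook (illustrative; the handler name is made up):

    #include <stdio.h>

    // route ggml fatal errors to stderr instead of the default stdout message
    static void my_abort_handler(const char * error_message) {
        fprintf(stderr, "ggml fatal error: %s\n", error_message);
    }

    ggml_abort_callback_t previous = ggml_set_abort_callback(my_abort_handler);
    // ... later, restore whatever was installed before
    ggml_set_abort_callback(previous);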
@@ -482,12 +489,13 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_CONV_2D,
         GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
         GGML_OP_POOL_2D_BACK,
-        GGML_OP_UPSCALE,
+        GGML_OP_UPSCALE,
         GGML_OP_PAD,
         GGML_OP_PAD_REFLECT_1D,
         GGML_OP_ROLL,
@@ -549,6 +557,8 @@ extern "C" {
         GGML_GLU_OP_REGLU,
         GGML_GLU_OP_GEGLU,
         GGML_GLU_OP_SWIGLU,
+        GGML_GLU_OP_GEGLU_ERF,
+        GGML_GLU_OP_GEGLU_QUICK,
 
         GGML_GLU_OP_COUNT,
     };
@@ -638,6 +648,9 @@ extern "C" {
 
     // misc
 
+    GGML_API const char * ggml_version(void);
+    GGML_API const char * ggml_commit(void);
+
     GGML_API void    ggml_time_init(void); // call this once at the beginning of the program
     GGML_API int64_t ggml_time_ms(void);
     GGML_API int64_t ggml_time_us(void);
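These return the values baked in through the new GGML_VERSION / GGML_COMMIT compile definitions (see the ggml/CMakeLists.txt hunk above). A trivial sketch:

    // print the build metadata exposed by the new accessors
    printf("ggml %s (commit %s)\n", ggml_version(), ggml_commit());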
@@ -1136,6 +1149,22 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_geglu_erf(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_erf_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // A: n columns, r rows,
     // B: n columns, r rows,
     GGML_API struct ggml_tensor * ggml_glu_split(
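A small graph-building sketch for the new GLU variants (illustrative; it assumes the non-_split helpers split the first dimension in half, as the existing ggml GLU ops do):

    // hypothetical shapes: 8 rows of width 2*64 -> result of width 64
    struct ggml_init_params ip = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 8);
    struct ggml_tensor * out = ggml_geglu_quick(ctx, inp); // GEGLU using the sigmoid-based "quick" GELU approximation
    // ggml_geglu_erf(ctx, inp) selects the erf-based GELU; the *_swapped variants swap the value/gate halves

    ggml_free(ctx);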
@@ -1159,6 +1188,16 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    GGML_API struct ggml_tensor * ggml_geglu_erf_split(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick_split(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     // normalize along rows
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,
@@ -1258,6 +1297,19 @@ extern "C" {
             struct ggml_tensor  * a,
             float                 s);
 
+    // x = s * a + b
+    GGML_API struct ggml_tensor * ggml_scale_bias(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 s,
+            float                 b);
+
+    GGML_API struct ggml_tensor * ggml_scale_bias_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 s,
+            float                 b);
+
     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set(
             struct ggml_context * ctx,
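A one-line sketch of the fused scale-and-shift (illustrative; ctx and t are assumed to exist):

    // y = 0.5 * t + 1.0 in a single op, instead of ggml_scale() followed by a separate add
    struct ggml_tensor * y = ggml_scale_bias(ctx, t, 0.5f, 1.0f);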
@@ -1502,8 +1554,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // a    [ne0, ne01, ne02, ne03]
+    // mask [ne0, ne11, ne12, ne13] | ne11 >= ne01, F16 or F32, optional
+    //
+    // broadcast:
+    //   ne02 % ne12 == 0
+    //   ne03 % ne13 == 0
+    //
     // fused soft_max(a*scale + mask*(ALiBi slope))
-    // mask is optional
     // max_bias = 0.0f for no ALiBi
     GGML_API struct ggml_tensor * ggml_soft_max_ext(
             struct ggml_context * ctx,
@@ -1813,6 +1871,17 @@ extern "C" {
             struct ggml_tensor  * b,
             int                   stride);
 
+    GGML_API struct ggml_tensor * ggml_conv_2d_direct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,   // convolution kernel [KW, KH, IC, OC]
+            struct ggml_tensor  * b,   // input data [W, H, C, N]
+            int                   s0,  // stride dimension 0
+            int                   s1,  // stride dimension 1
+            int                   p0,  // padding dimension 0
+            int                   p1,  // padding dimension 1
+            int                   d0,  // dilation dimension 0
+            int                   d1); // dilation dimension 1
+
     enum ggml_op_pool {
         GGML_OP_POOL_MAX,
         GGML_OP_POOL_AVG,
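A usage sketch (illustrative; the shapes follow the parameter comments above, and ctx is assumed to exist):

    // 3x3 kernel, 16 input channels, 32 output channels, applied to a 64x64x16 image (batch of 1)
    struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 16, 32);
    struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 64, 16, 1);
    // stride 1, padding 1, dilation 1; this maps onto the new GGML_OP_CONV_2D op
    struct ggml_tensor * out = ggml_conv_2d_direct(ctx, kernel, input, 1, 1, 1, 1, 1, 1);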
@@ -1855,6 +1924,12 @@ extern "C" {
     enum ggml_scale_mode {
         GGML_SCALE_MODE_NEAREST  = 0,
         GGML_SCALE_MODE_BILINEAR = 1,
+
+        GGML_SCALE_MODE_COUNT
+    };
+
+    enum ggml_scale_flag {
+        GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8)
     };
 
     // interpolate
@@ -1867,14 +1942,26 @@ extern "C" {
 
     // interpolate
     // interpolate scale to specified dimensions
-    GGML_API struct ggml_tensor * ggml_upscale_ext(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_upscale_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   ne0,
             int                   ne1,
             int                   ne2,
             int                   ne3,
-            enum ggml_scale_mode  mode)
+            enum ggml_scale_mode  mode),
+        "use ggml_interpolate instead");
+
+    // Up- or downsamples the input to the specified size.
+    // 2D scale modes (eg. bilinear) are applied to the first two dimensions.
+    GGML_API struct ggml_tensor * ggml_interpolate(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3,
+            uint32_t              mode); // ggml_scale_mode [ | ggml_scale_flag...]
 
     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     GGML_API struct ggml_tensor * ggml_pad(
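A migration sketch from the deprecated call (illustrative; ctx and img are assumed to exist):

    // before: ggml_upscale_ext(ctx, img, 256, 256, img->ne[2], img->ne[3], GGML_SCALE_MODE_BILINEAR);
    // after:  the mode argument can also carry flags, e.g. align-corners sampling
    struct ggml_tensor * resized = ggml_interpolate(ctx, img, 256, 256, img->ne[2], img->ne[3],
                                                    GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS);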
@@ -1937,11 +2024,17 @@ extern "C" {
 
 #define GGML_KQ_MASK_PAD 64
 
-    // q:    [n_embd_k, n_batch, n_head,
-    // k:    [n_embd_k, n_kv, n_head_kv,
-    // v:    [n_embd_v, n_kv, n_head_kv,
-    // mask: [n_kv, n_batch_pad,
-    // res:  [n_embd_v, n_head, n_batch,
+    // q:    [n_embd_k, n_batch, n_head, ne3 ]
+    // k:    [n_embd_k, n_kv, n_head_kv, ne3 ]
+    // v:    [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
+    // mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
+    //
+    // broadcast:
+    //   n_head % n_head_kv == 0
+    //   n_head % ne32 == 0
+    //   ne3 % ne33 == 0
+    //
     GGML_API struct ggml_tensor * ggml_flash_attn_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
@@ -1980,7 +2073,8 @@ extern "C" {
             struct ggml_tensor  * dt,
             struct ggml_tensor  * A,
             struct ggml_tensor  * B,
-            struct ggml_tensor  * C
+            struct ggml_tensor  * C,
+            struct ggml_tensor  * ids);
 
     // partition into non-overlapping windows with padding if needed
     // example:
package/src/llama.cpp/ggml/src/CMakeLists.txt
CHANGED

@@ -365,12 +365,12 @@ ggml_add_backend(BLAS)
 ggml_add_backend(CANN)
 ggml_add_backend(CUDA)
 ggml_add_backend(HIP)
-ggml_add_backend(Kompute)
 ggml_add_backend(METAL)
 ggml_add_backend(MUSA)
 ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
 ggml_add_backend(Vulkan)
+ggml_add_backend(WebGPU)
 ggml_add_backend(OpenCL)
 
 foreach (target ggml-base ggml)
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
CHANGED

@@ -5,7 +5,7 @@ function(ggml_add_cpu_backend_features cpu_name arch)
     # build, using set_source_files_properties() to set the arch flags is not possible
     set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
     add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
-    target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE .
+    target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . ../include)
     target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
     target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
     set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -589,4 +589,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     if (EMSCRIPTEN)
         set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
     endif()
+
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
+        # The compiler automatically enables "-ffast-math" which can cause NaNs in tests due to "-fassociative-math"
+        target_compile_options(${GGML_CPU_NAME} PRIVATE "-fno-associative-math")
+    endif()
 endfunction()
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
CHANGED

@@ -1193,7 +1193,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
     }
 }
 
-
+void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
               struct ggml_tensor * dst) {
 
@@ -1866,6 +1866,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case GGML_OP_CONV_2D:
+            {
+                ggml_compute_forward_conv_2d(params, tensor);
+            } break;
         case GGML_OP_CONV_2D_DW:
             {
                 ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -2168,6 +2172,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             case GGML_GLU_OP_REGLU:
             case GGML_GLU_OP_GEGLU:
             case GGML_GLU_OP_SWIGLU:
+            case GGML_GLU_OP_GEGLU_ERF:
+            case GGML_GLU_OP_GEGLU_QUICK:
                 {
                     n_tasks = n_threads;
                 } break;
@@ -2228,6 +2234,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
+        case GGML_OP_CONV_2D:
         case GGML_OP_CONV_2D_DW:
         case GGML_OP_CONV_TRANSPOSE_1D:
         case GGML_OP_CONV_TRANSPOSE_2D:
@@ -2746,6 +2753,10 @@ struct ggml_cplan ggml_graph_plan(
                         GGML_ABORT("fatal error");
                     }
                 } break;
+            case GGML_OP_CONV_2D:
+                {
+                    cur = GGML_IM2COL_WORK_SIZE;
+                } break;
             case GGML_OP_CONV_TRANSPOSE_2D:
                 {
                     const int64_t ne00 = node->src[0]->ne[0]; // W