@fugood/llama.node 0.3.9 → 0.3.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.js +2 -2
- package/lib/binding.ts +47 -8
- package/lib/index.js +21 -1
- package/lib/index.ts +31 -1
- package/package.json +12 -3
- package/src/LlamaCompletionWorker.cpp +33 -6
- package/src/LlamaCompletionWorker.h +3 -1
- package/src/LlamaContext.cpp +336 -28
- package/src/LlamaContext.h +2 -0
- package/src/common.hpp +19 -2
- package/src/llama.cpp/.github/workflows/build.yml +289 -107
- package/src/llama.cpp/.github/workflows/close-issue.yml +1 -1
- package/src/llama.cpp/.github/workflows/docker.yml +2 -1
- package/src/llama.cpp/.github/workflows/server.yml +25 -2
- package/src/llama.cpp/CMakeLists.txt +10 -19
- package/src/llama.cpp/cmake/build-info.cmake +1 -1
- package/src/llama.cpp/common/CMakeLists.txt +32 -0
- package/src/llama.cpp/common/arg.cpp +66 -16
- package/src/llama.cpp/common/chat-template.hpp +515 -0
- package/src/llama.cpp/common/chat.cpp +966 -0
- package/src/llama.cpp/common/chat.hpp +52 -0
- package/src/llama.cpp/common/common.cpp +159 -36
- package/src/llama.cpp/common/common.h +56 -14
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +46 -66
- package/src/llama.cpp/common/json-schema-to-grammar.h +15 -1
- package/src/llama.cpp/common/llguidance.cpp +270 -0
- package/src/llama.cpp/common/log.cpp +1 -10
- package/src/llama.cpp/common/log.h +10 -0
- package/src/llama.cpp/common/minja.hpp +2868 -0
- package/src/llama.cpp/common/sampling.cpp +22 -1
- package/src/llama.cpp/common/sampling.h +3 -0
- package/src/llama.cpp/docs/build.md +54 -9
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +12 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +59 -0
- package/src/llama.cpp/examples/llava/clip.cpp +133 -14
- package/src/llama.cpp/examples/llava/clip.h +2 -0
- package/src/llama.cpp/examples/llava/llava.cpp +22 -8
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +9 -1
- package/src/llama.cpp/examples/main/main.cpp +26 -25
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +136 -137
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +18 -4
- package/src/llama.cpp/examples/run/run.cpp +224 -69
- package/src/llama.cpp/examples/server/server.cpp +252 -81
- package/src/llama.cpp/examples/server/utils.hpp +73 -21
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +6 -4
- package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +78 -1
- package/src/llama.cpp/ggml/include/ggml.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +21 -4
- package/src/llama.cpp/ggml/src/ggml-alloc.c +1 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +91 -78
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +7 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +46 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +16 -1
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +28 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +5 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +33 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +1 -5
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +323 -121
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
- package/src/llama.cpp/ggml/src/ggml.c +23 -13
- package/src/llama.cpp/include/llama.h +14 -1
- package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +46 -0
- package/src/llama.cpp/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/src/llama-arch.cpp +7 -2
- package/src/llama.cpp/src/llama-arch.h +3 -1
- package/src/llama.cpp/src/llama-chat.cpp +11 -2
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +86 -6
- package/src/llama.cpp/src/llama-grammar.h +22 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -1
- package/src/llama.cpp/src/llama-model.cpp +76 -6
- package/src/llama.cpp/src/llama-sampling.cpp +47 -4
- package/src/llama.cpp/src/llama-vocab.cpp +10 -4
- package/src/llama.cpp/src/llama.cpp +181 -123
- package/src/llama.cpp/tests/CMakeLists.txt +4 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +158 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +154 -31
- package/src/llama.cpp/tests/test-chat.cpp +607 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +1140 -0
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +0 -32

package/src/llama.cpp/examples/server/utils.hpp
@@ -5,10 +5,6 @@
 #include "llama.h"
 #include "common/base64.hpp"
 
-#ifndef NDEBUG
-// crash the server in debug mode, otherwise send an http 500 error
-#define CPPHTTPLIB_NO_EXCEPTIONS 1
-#endif
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
 #include "httplib.h"

@@ -16,6 +12,9 @@
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
+#include "minja.hpp"
+#include "chat.hpp"
+#include "chat-template.hpp"
 
 #include <random>
 #include <sstream>

@@ -349,7 +348,7 @@ static llama_tokens format_infill(
 }
 
 // Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const
+inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
     std::vector<common_chat_msg> chat;
 
     for (size_t i = 0; i < messages.size(); ++i) {

@@ -374,10 +373,10 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
             throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
         }
 
-        chat.push_back({role, content});
+        chat.push_back({role, content, /* tool_calls= */ {}});
     }
 
-    const auto formatted_chat = common_chat_apply_template(
+    const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
     LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
 
     return formatted_chat;

@@ -576,14 +575,32 @@ static json oaicompat_completion_params_parse(const json & body) {
     return llama_params;
 }
 
-static json
-
-
-
+static json oaicompat_completion_params_parse(
+    const json & body, /* openai api json semantics */
+    bool use_jinja,
+    const common_chat_templates & chat_templates)
+{
     json llama_params;
+    const auto & tmpl = body.contains("tools") && chat_templates.template_tool_use
+        ? *chat_templates.template_tool_use
+        : *chat_templates.template_default;
 
-
-
+    auto tools = json_value(body, "tools", json());
+    auto stream = json_value(body, "stream", false);
+
+    if (tools.is_array() && !tools.empty()) {
+        if (stream) {
+            throw std::runtime_error("Cannot use tools with stream");
+        }
+        if (!use_jinja) {
+            throw std::runtime_error("tools param requires --jinja flag");
+        }
+    }
+    if (!use_jinja) {
+        if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) {
+            throw std::runtime_error("Unsupported param: tool_choice");
+        }
+    }
 
     // Handle "stop" field
     if (body.contains("stop") && body.at("stop").is_string()) {

@@ -606,6 +623,49 @@ static json oaicompat_chat_completion_params_parse(
         }
     }
 
+    // Apply chat template to the list of messages
+    if (use_jinja) {
+        auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
+        if (tool_choice != "none" && tool_choice != "auto" && tool_choice != "required") {
+            throw std::runtime_error("Invalid tool_choice: " + tool_choice);
+        }
+        if (tool_choice != "none" && llama_params.contains("grammar")) {
+            throw std::runtime_error("Cannot use custom grammar constraints with tools.");
+        }
+        common_chat_inputs inputs;
+        inputs.messages = body.at("messages");
+        inputs.tools = tools;
+        inputs.tool_choice = tool_choice;
+        inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
+        if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
+            LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
+            inputs.parallel_tool_calls = false;
+        }
+        inputs.stream = stream;
+        // TODO: support mixing schema w/ tools beyond generic format.
+        inputs.json_schema = json_value(llama_params, "json_schema", json());
+        auto chat_params = common_chat_params_init(tmpl, inputs);
+
+        llama_params["chat_format"] = static_cast<int>(chat_params.format);
+        llama_params["prompt"] = chat_params.prompt;
+        llama_params["grammar"] = chat_params.grammar;
+        llama_params["grammar_lazy"] = chat_params.grammar_lazy;
+        auto grammar_triggers = json::array();
+        for (const auto & trigger : chat_params.grammar_triggers) {
+            grammar_triggers.push_back({
+                {"word", trigger.word},
+                {"at_start", trigger.at_start},
+            });
+        }
+        llama_params["grammar_triggers"] = grammar_triggers;
+        llama_params["preserved_tokens"] = chat_params.preserved_tokens;
+        for (const auto & stop : chat_params.additional_stops) {
+            llama_params["stop"].push_back(stop);
+        }
+    } else {
+        llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
+    }
+
     // Handle "n" field
     int n_choices = json_value(body, "n", 1);
     if (n_choices != 1) {

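Note: the two hunks above add tool-call handling to the OpenAI-compatible endpoint: a request that carries "tools" must be non-streaming and requires the server to run with --jinja, and the selected chat template is compiled into a prompt plus (optionally lazy) grammar with trigger words. A minimal sketch of a request body that would exercise this path; the field names are taken from the hunks, while the tool schema itself follows the OpenAI convention and is an assumption:

    #include <nlohmann/json.hpp>

    // Hypothetical request body for the new tool-call path: "stream" must be false
    // when "tools" is present, and the server must have been started with --jinja.
    static nlohmann::json example_tool_request() {
        return nlohmann::json::parse(R"({
            "messages": [
                {"role": "user", "content": "What is the weather in Tokyo?"}
            ],
            "tools": [{
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "parameters": {
                        "type": "object",
                        "properties": {"location": {"type": "string"}},
                        "required": ["location"]
                    }
                }
            }],
            "tool_choice": "auto",
            "parallel_tool_calls": false,
            "stream": false
        })");
    }
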
@@ -620,14 +680,6 @@ static json oaicompat_chat_completion_params_parse(
         throw std::runtime_error("top_logprobs requires logprobs to be set to true");
     }
 
-    // Params supported by OAI but unsupported by llama.cpp
-    static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
-    for (const auto & param : unsupported_params) {
-        if (body.contains(param)) {
-            throw std::runtime_error("Unsupported param: " + param);
-        }
-    }
-
     // Copy remaining properties to llama_params
     // This allows user to use llama.cpp-specific params like "mirostat", ... via OAI endpoint.
     // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp

package/src/llama.cpp/examples/simple-chat/simple-chat.cpp
@@ -95,13 +95,15 @@ int main(int argc, char ** argv) {
     llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
 
     // helper function to evaluate a prompt and generate a response
-    auto generate = [&](const std::string & prompt
+    auto generate = [&](const std::string & prompt) {
         std::string response;
 
+        const bool is_first = llama_get_kv_cache_used_cells(ctx) == 0;
+
         // tokenize the prompt
         const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
         std::vector<llama_token> prompt_tokens(n_prompt_tokens);
-        if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(),
+        if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), is_first, true) < 0) {
            GGML_ABORT("failed to tokenize the prompt\n");
        }
 

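Note: the hunk above moves simple-chat.cpp to the two-pass llama_tokenize pattern: a first call with a NULL buffer returns the negated required token count, a second call fills the buffer, and `add_special` is only true on the first turn (empty KV cache). A standalone sketch of the same pattern, assuming `vocab` comes from a loaded model:

    #include <string>
    #include <vector>
    #include "llama.h"

    // Two-pass tokenization as used in the hunk above (sketch; `vocab` and
    // `add_special` are assumed to come from the caller).
    static std::vector<llama_token> tokenize(const llama_vocab * vocab,
                                             const std::string & text,
                                             bool add_special) {
        // First call: a NULL buffer makes llama_tokenize return the negated token count.
        const int n_tokens = -llama_tokenize(vocab, text.c_str(), text.size(),
                                             NULL, 0, add_special, /* parse_special */ true);
        std::vector<llama_token> tokens(n_tokens);
        // Second call: fill the buffer; a negative return value signals failure.
        if (llama_tokenize(vocab, text.c_str(), text.size(),
                           tokens.data(), tokens.size(), add_special, true) < 0) {
            tokens.clear();
        }
        return tokens;
    }
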
@@ -161,7 +163,7 @@ int main(int argc, char ** argv) {
             break;
         }
 
-        const char * tmpl = llama_model_chat_template(model);
+        const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
 
         // add the user input to the message list and format it
         messages.push_back({"user", strdup(user.c_str())});

@@ -180,7 +182,7 @@ int main(int argc, char ** argv) {
 
         // generate a response
         printf("\033[33m");
-        std::string response = generate(prompt
+        std::string response = generate(prompt);
         printf("\n\033[0m");
 
         // add the response to the messages

package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt
@@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 3.12)
+project(llama-simple-cmake-pkg)
+
+set(TARGET llama-simple-cmake-pkg)
+
+find_package(Llama REQUIRED)
+
+add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../simple/simple.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ggml::all ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/ggml/CMakeLists.txt
@@ -58,7 +58,8 @@ else()
     set(GGML_BLAS_VENDOR_DEFAULT "Generic")
 endif()
 
-if (CMAKE_CROSSCOMPILING)
+if (CMAKE_CROSSCOMPILING OR DEFINED ENV{SOURCE_DATE_EPOCH})
+    message(STATUS "Setting GGML_NATIVE_DEFAULT to OFF")
     set(GGML_NATIVE_DEFAULT OFF)
 else()
     set(GGML_NATIVE_DEFAULT ON)

@@ -153,6 +154,8 @@ option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashA
 option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
 
 option(GGML_HIP "ggml: use HIP" OFF)
+option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
+option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)

@@ -264,3 +267,77 @@ if (GGML_STANDALONE)
     install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
             DESTINATION share/pkgconfig)
 endif()
+
+#
+# Create CMake package
+#
+
+# Generate version info based on git commit.
+
+if(NOT DEFINED GGML_BUILD_NUMBER)
+    find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH)
+    execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE GGML_BUILD_NUMBER
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+
+    if(GGML_BUILD_NUMBER EQUAL 1)
+        message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.")
+    endif()
+
+    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE GGML_BUILD_COMMIT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+endif()
+
+
+# Capture variables prefixed with GGML_.
+
+set(variable_set_statements
+"
+####### Expanded from @GGML_VARIABLES_EXPANED@ by configure_package_config_file() #######
+####### Any changes to this file will be overwritten by the next CMake run #######
+
+")
+
+set(GGML_SHARED_LIB ${BUILD_SHARED_LIBS})
+
+get_cmake_property(all_variables VARIABLES)
+foreach(variable_name IN LISTS all_variables)
+    if(variable_name MATCHES "^GGML_")
+        string(REPLACE ";" "\\;"
+            variable_value "${${variable_name}}")
+
+        set(variable_set_statements
+            "${variable_set_statements}set(${variable_name} \"${variable_value}\")\n")
+    endif()
+endforeach()
+
+set(GGML_VARIABLES_EXPANDED ${variable_set_statements})
+
+# Create the CMake package and set install location.
+
+set(GGML_INSTALL_VERSION 0.0.${GGML_BUILD_NUMBER})
+set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
+set(GGML_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
+set(GGML_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
+
+configure_package_config_file(
+    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/ggml-config.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml
+    PATH_VARS GGML_INCLUDE_INSTALL_DIR
+              GGML_LIB_INSTALL_DIR
+              GGML_BIN_INSTALL_DIR)
+
+write_basic_package_version_file(
+    ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
+    VERSION ${GGML_INSTALL_VERSION}
+    COMPATIBILITY SameMajorVersion)
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
+              ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)

package/src/llama.cpp/ggml/src/CMakeLists.txt
@@ -93,12 +93,18 @@ endif()
 
 if (GGML_CCACHE)
     find_program(GGML_CCACHE_FOUND ccache)
+    find_program(GGML_SCCACHE_FOUND sccache)
 
-    if (GGML_CCACHE_FOUND)
+    if (GGML_CCACHE_FOUND OR GGML_SCCACHE_FOUND)
+        if(GGML_CCACHE_FOUND)
+            set(GGML_CCACHE_VARIANT ccache)
+        else()
+            set(GGML_CCACHE_VARIANT sccache)
+        endif()
         # TODO: should not be set globally
-        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE
+        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
         set(ENV{CCACHE_SLOPPINESS} time_macros)
-        message(STATUS "
+        message(STATUS "${GGML_CCACHE_VARIANT} found, compilation results will be cached. Disable with GGML_CCACHE=OFF.")
     else()
         message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with GGML_CCACHE=OFF")
     endif ()

@@ -250,6 +256,17 @@ function(ggml_add_backend_library backend)
         target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD)
         target_compile_definitions(${backend} PUBLIC GGML_BACKEND_SHARED)
     endif()
+
+    if(NOT GGML_AVAILABLE_BACKENDS)
+        set(GGML_AVAILABLE_BACKENDS "${backend}"
+            CACHE INTERNAL "List of backends for cmake package")
+    else()
+        list(FIND GGML_AVAILABLE_BACKENDS "${backend}" has_backend)
+        if(has_backend EQUAL -1)
+            set(GGML_AVAILABLE_BACKENDS "${GGML_AVAILABLE_BACKENDS};${backend}"
+                CACHE INTERNAL "List of backends for cmake package")
+        endif()
+    endif()
 endfunction()
 
 function(ggml_add_backend backend)

@@ -297,7 +314,7 @@ if (GGML_CPU_ALL_VARIANTS)
         # MSVC doesn't support AMX
         ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
     endif()
-
+elseif (GGML_CPU)
     ggml_add_cpu_backend_variant_impl("")
 endif()
 

package/src/llama.cpp/ggml/src/ggml-alloc.c
@@ -989,19 +989,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
             this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
         }
 
-        if (this_size > max_size) {
-            GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
-                    __func__, t->name,
-                    ggml_backend_buft_name(buft),
-                    this_size, max_size);
-            for (size_t i = 0; i < n_buffers; i++) {
-                ggml_backend_buffer_free(buffers[i]);
-            }
-            free(buffers);
-            return NULL;
-        }
-
-        if ((cur_buf_size + this_size) > max_size) {
+        if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
             // allocate tensors in the current buffer
             if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                 return NULL;

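Note: the hunk above drops the early "tensor too large" rejection and only starts a new buffer when the current one is non-empty, so an oversized tensor is still handed to the backend allocator on its own instead of failing up front. A toy walkthrough of the new splitting condition (the sizes are made-up numbers for illustration, not ggml behavior):

    #include <cstdio>
    #include <vector>

    // Sketch of the buffer-splitting decision after the change above: a new buffer
    // is started only when the current one is non-empty and the next tensor would
    // not fit alongside it.
    int main() {
        const size_t max_size = 100;
        const std::vector<size_t> tensor_sizes = {40, 50, 30, 120};

        size_t cur_buf_size = 0;
        int    n_buffers    = 0;
        for (size_t this_size : tensor_sizes) {
            if (cur_buf_size > 0 && cur_buf_size + this_size > max_size) {
                n_buffers++;           // flush the current buffer (alloc_tensor_range in ggml)
                cur_buf_size = 0;
            }
            cur_buf_size += this_size; // the 120-byte tensor gets a buffer of its own
        }
        if (cur_buf_size > 0) {
            n_buffers++;
        }
        printf("allocated %d buffers\n", n_buffers); // 3: {40,50}, {30}, {120}
    }
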
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c
@@ -297,6 +297,90 @@ static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
 static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
 #endif
 
+#if defined(__loongarch_sx)
+
+static __m128i lsx_packs_w(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_w(a, 15);
+    tmp1 = __lsx_vsat_w(b, 15);
+    return __lsx_vpickev_h(tmp1, tmp);
+}
+
+static __m128i lsx_packs_h(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_h(a, 7);
+    tmp1 = __lsx_vsat_h(b, 7);
+    return __lsx_vpickev_b(tmp1, tmp);
+}
+
+static __m128i lsx_packus_h(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_hu(a, 7);
+    tmp1 = __lsx_vsat_hu(b, 7);
+    return __lsx_vpickev_b(tmp1, tmp);
+}
+
+static __m128i lsx_maddubs_h(__m128i a, __m128i b) {
+    __m128i tmp1, tmp2;
+    tmp1 = __lsx_vmulwev_h_b(a, b);
+    tmp2 = __lsx_vmulwod_h_b(a, b);
+    return __lsx_vsadd_h(tmp1, tmp2);
+}
+
+static __m128i lsx_madd_h(__m128i a, __m128i b) {
+    __m128i tmp1, tmp2;
+    tmp1 = __lsx_vmulwev_w_h(a, b);
+    tmp2 = __lsx_vmulwod_w_h(a, b);
+    return __lsx_vadd_w(tmp1, tmp2);
+}
+
+static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) {
+    v4i32 __ret = {d, c, b, a};
+    return (__m128i)__ret;
+}
+
+static __m128i lsx_shuffle_b(__m128i a, __m128i b) {
+    __m128i mask_f, zero, tmp0, tmp2, mask;
+    int f = 0x8f;
+    mask_f = __lsx_vreplgr2vr_b(f);
+    zero = __lsx_vldi(0);
+    tmp0 = __lsx_vand_v(b, mask_f); // get mask with low 4 bit and sign bits
+    tmp0 = __lsx_vori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive
+    mask = __lsx_vsle_b(zero, tmp0); // if mask >= 0, set mask
+    tmp2 = __lsx_vand_v(tmp0, mask); // maskout the in2 < ones
+    return __lsx_vshuf_b(a, zero, tmp2);
+}
+
+static __m128i lsx_hadd_h(__m128i a, __m128i b) {
+    __m128i tmp1 = __lsx_vpickev_h(b, a);
+    __m128i tmp2 = __lsx_vpickod_h(b, a);
+    return __lsx_vadd_h(tmp1, tmp2);
+}
+
+static __m128i lsx_hadd_w(__m128i a, __m128i b) {
+    __m128i tmp1 = __lsx_vpickev_w(b, a);
+    __m128i tmp2 = __lsx_vpickod_w(b, a);
+    return __lsx_vadd_w(tmp1, tmp2);
+}
+
+static __m128 lsx_hadd_s(__m128 a, __m128 b) {
+    __m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a);
+    __m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a);
+
+    return __lsx_vfadd_s(tmp1, tmp2);
+}
+
+static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
+    __m128 res_0 =lsx_hadd_s(a, b);
+    __m128 res_1 =lsx_hadd_s(c, d);
+    __m128 res =lsx_hadd_s(res_0, res_1);
+    res =lsx_hadd_s(res, res);
+    res =lsx_hadd_s(res, res);
+
+    return ((v4f32)res)[0];
+}
+#endif
+
 #if defined(__loongarch_asx)
 
 #ifdef __clang__

@@ -395,11 +479,6 @@ static __m256i lasx_set_w(int e7, int e6, int e5, int e4, int e3, int e2, int e1
     return (__m256i)__ret;
 }
 
-static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) {
-    v4i32 __ret = {d, c, b, a};
-    return (__m128i)__ret;
-}
-
 static __m256i lasx_set_d(int64_t a, int64_t b, int64_t c, int64_t d) {
     v4i64 __ret = {d, c, b, a};
     return (__m256i)__ret;

@@ -409,18 +488,6 @@ static __m256i lasx_insertf128( __m128i x, __m128i y) {
     return lasx_set_q(x, y);
 }
 
-static __m128i lsx_shuffle_b(__m128i a, __m128i b) {
-    __m128i mask_f, zero, tmp0, tmp2, mask;
-    int f = 0x8f;
-    mask_f = __lsx_vreplgr2vr_b(f);
-    zero = __lsx_vldi(0);
-    tmp0 = __lsx_vand_v(b, mask_f); // get mask with low 4 bit and sign bits
-    tmp0 = __lsx_vori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive
-    mask = __lsx_vsle_b(zero, tmp0); // if mask >= 0, set mask
-    tmp2 = __lsx_vand_v(tmp0, mask); // maskout the in2 < ones
-    return __lsx_vshuf_b(a, zero, tmp2);
-}
-
 static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
     __m256i mask_f, zero, tmp0, tmp2, mask;
     int f = 0x8f;

@@ -482,25 +549,6 @@ static __m128 lasx_extractf128( __m256 a, int pos) {
     return ret;
 }
 
-static __m128i lsx_hadd_h(__m128i a, __m128i b) {
-    __m128i tmp1 = __lsx_vpickev_h(b, a);
-    __m128i tmp2 = __lsx_vpickod_h(b, a);
-    return __lsx_vadd_h(tmp1, tmp2);
-}
-
-static __m128i lsx_hadd_w(__m128i a, __m128i b) {
-    __m128i tmp1 = __lsx_vpickev_w(b, a);
-    __m128i tmp2 = __lsx_vpickod_w(b, a);
-    return __lsx_vadd_w(tmp1, tmp2);
-}
-
-static __m128 lsx_hadd_s(__m128 a, __m128 b) {
-    __m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a);
-    __m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a);
-
-    return __lsx_vfadd_s(tmp1, tmp2);
-}
-
 static __m256i lasx_maddubs_h(__m256i a, __m256i b) {
     __m256i tmp1, tmp2;
     tmp1 = __lasx_xvmulwev_h_b(a, b);

@@ -529,42 +577,6 @@ static __m256i lasx_packs_h(__m256i a, __m256i b) {
     return __lasx_xvpickev_b(tmp1, tmp);
 }
 
-static __m128i lsx_packs_w(__m128i a, __m128i b) {
-    __m128i tmp, tmp1;
-    tmp = __lsx_vsat_w(a, 15);
-    tmp1 = __lsx_vsat_w(b, 15);
-    return __lsx_vpickev_h(tmp1, tmp);
-}
-
-static __m128i lsx_packs_h(__m128i a, __m128i b) {
-    __m128i tmp, tmp1;
-    tmp = __lsx_vsat_h(a, 7);
-    tmp1 = __lsx_vsat_h(b, 7);
-    return __lsx_vpickev_b(tmp1, tmp);
-}
-
-static __m128i lsx_packus_h(__m128i a, __m128i b) {
-    __m128i tmp, tmp1;
-    tmp = __lsx_vsat_hu(a, 7);
-    tmp1 = __lsx_vsat_hu(b, 7);
-    return __lsx_vpickev_b(tmp1, tmp);
-}
-
-
-static __m128i lsx_maddubs_h(__m128i a, __m128i b) {
-    __m128i tmp1, tmp2;
-    tmp1 = __lsx_vmulwev_h_b(a, b);
-    tmp2 = __lsx_vmulwod_h_b(a, b);
-    return __lsx_vsadd_h(tmp1, tmp2);
-}
-
-static __m128i lsx_madd_h(__m128i a, __m128i b) {
-    __m128i tmp1, tmp2;
-    tmp1 = __lsx_vmulwev_w_h(a, b);
-    tmp2 = __lsx_vmulwod_w_h(a, b);
-    return __lsx_vadd_w(tmp1, tmp2);
-}
-
 // multiply int8_t, add results pairwise twice
 static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
     // Get absolute values of x vectors

@@ -2232,21 +2244,22 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = hsum_float_8(acc);
+
 #elif defined(__loongarch_sx)
     // set constants
     const __m128i low_mask = __lsx_vreplgr2vr_b(0xF);
     const __m128i off = __lsx_vreplgr2vr_b(8);
 
     // Initialize accumulator with zeros
-    __m128 acc_0 = __lsx_vldi(0);
-    __m128 acc_1 = __lsx_vldi(0);
-    __m128 acc_2 = __lsx_vldi(0);
-    __m128 acc_3 = __lsx_vldi(0);
+    __m128 acc_0 = (__m128)__lsx_vldi(0);
+    __m128 acc_1 = (__m128)__lsx_vldi(0);
+    __m128 acc_2 = (__m128)__lsx_vldi(0);
+    __m128 acc_3 = (__m128)__lsx_vldi(0);
 
     for (; ib + 1 < nb; ib += 2) {
 
         // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = __lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
+        const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
 
         const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);
 

@@ -2264,7 +2277,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
 
         // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = __lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
+        const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
 
         const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);
 

package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1302,7 +1302,7 @@ struct ggml_threadpool {
     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop; // Used for stopping the threadpool altogether
     atomic_bool pause; // Used for pausing the threadpool or individual threads
-
+    atomic_int abort; // Used for aborting processing of a graph
 
     struct ggml_compute_state * workers; // per thread state
     int n_threads_max; // number of threads in the pool

@@ -7883,7 +7883,7 @@ static void ggml_compute_forward_out_prod_f32(
 
         float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
         float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-        float * d = (float *) ((char *) dst->data + ( i1*nb1
+        float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
 
         ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1);
     }

@@ -7892,7 +7892,7 @@ static void ggml_compute_forward_out_prod_f32(
 
         float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
         float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-        float * d = (float *) ((char *) dst->data + ( i1*nb1
+        float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
 
         ggml_vec_mad_f32(ne0, d, s0, *s1);
     }

@@ -13851,14 +13851,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         /*.threadpool=*/ tp,
     };
 
-    for (int node_n = 0; node_n < cgraph->n_nodes &&
+    for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];
 
         ggml_compute_forward(&params, node);
 
         if (state->ith == 0 && cplan->abort_callback &&
            cplan->abort_callback(cplan->abort_callback_data)) {
-            tp->abort
+            atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
            tp->ec = GGML_STATUS_ABORTED;
        }
 

@@ -14031,7 +14031,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
     threadpool->current_chunk = 0;
     threadpool->stop = false;
     threadpool->pause = tpp->paused;
-    threadpool->abort =
+    threadpool->abort = -1;
     threadpool->workers = NULL;
     threadpool->n_threads_max = tpp->n_threads;
     threadpool->n_threads_cur = tpp->n_threads;

@@ -14110,7 +14110,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         threadpool->cgraph = cgraph;
         threadpool->cplan = cplan;
         threadpool->current_chunk = 0;
-        threadpool->abort =
+        threadpool->abort = -1;
         threadpool->ec = GGML_STATUS_SUCCESS;
     }
 

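Note: the hunks above replace the threadpool's boolean abort flag with an atomic int that stays at -1 while no abort is pending and otherwise holds `node_n + 1`, so every worker finishes the current node and leaves the compute loop at the same graph position. A minimal sketch of the pattern using std::atomic; the names are illustrative, not the ggml internals:

    #include <atomic>
    #include <vector>

    // `abort` starts at -1 ("no abort"); whichever thread observes the abort
    // callback stores node_n + 1, and every worker's loop condition stops it
    // once its own node index reaches that value.
    struct worker_state {
        std::atomic<int> abort{-1};
    };

    inline void compute_graph(worker_state & tp, const std::vector<int> & nodes,
                              bool (*abort_callback)()) {
        for (int node_n = 0;
             node_n < (int) nodes.size() &&
             tp.abort.load(std::memory_order_relaxed) != node_n;
             node_n++) {
            // ... compute nodes[node_n] ...
            if (abort_callback && abort_callback()) {
                tp.abort.store(node_n + 1, std::memory_order_relaxed);
            }
        }
    }
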
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -416,7 +416,8 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
         case GGML_OP_IM2COL_BACK:
             return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
         case GGML_OP_OUT_PROD:
-            return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)
+            return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
+                src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
         default:
             return true;
         }

package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt
@@ -28,7 +28,7 @@ if (CUDAToolkit_FOUND)
     list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")
 
     file(GLOB GGML_SOURCES_CUDA "*.cu")
-    file(GLOB SRCS "template-instances/fattn-
+    file(GLOB SRCS "template-instances/fattn-mma*.cu")
     list(APPEND GGML_SOURCES_CUDA ${SRCS})
     file(GLOB SRCS "template-instances/mmq*.cu")
     list(APPEND GGML_SOURCES_CUDA ${SRCS})