@fugood/llama.node 0.3.16 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +238 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +6 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +160 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1760 -534
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295

package/src/llama.cpp/examples/server/utils.hpp

@@ -3,7 +3,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
-#include "
+#include "base64.hpp"

 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
@@ -58,6 +58,32 @@ static T json_value(const json & body, const std::string & key, const T & defaul

 const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);

+// thin wrapper around common_grammar_trigger with (de)serialization functions
+struct server_grammar_trigger {
+    common_grammar_trigger value;
+
+    server_grammar_trigger() = default;
+    server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
+    server_grammar_trigger(const json & in) {
+        value.type = (common_grammar_trigger_type) in.at("type").get<int>();
+        value.value = in.at("value").get<std::string>();
+        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+            value.token = (llama_token) in.at("token").get<int>();
+        }
+    }
+
+    json to_json() const {
+        json out {
+            {"type", (int) value.type},
+            {"value", value.value},
+        };
+        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+            out["token"] = (int) value.token;
+        }
+        return out;
+    }
+};
+
 //
 // tokenizer and input processing utils
 //
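For illustration, a minimal self-contained sketch of the round-trip that `server_grammar_trigger` implements, using nlohmann::json as the server does. `common_grammar_trigger` and its enum live in llama.cpp's common library, so the stand-in type and enum values below are hypothetical:

    #include <nlohmann/json.hpp>
    #include <cassert>
    #include <string>

    using json = nlohmann::json;

    // Hypothetical stand-ins for common_grammar_trigger.
    enum trigger_type { TRIGGER_TYPE_WORD = 0, TRIGGER_TYPE_TOKEN = 1 };

    struct trigger {
        trigger_type type;
        std::string  value;
        int          token = -1; // only meaningful for token-typed triggers
    };

    // Mirror of to_json(): always emit "type" and "value"; add "token"
    // only for token-typed triggers.
    static json trigger_to_json(const trigger & t) {
        json out { {"type", (int) t.type}, {"value", t.value} };
        if (t.type == TRIGGER_TYPE_TOKEN) {
            out["token"] = t.token;
        }
        return out;
    }

    // Mirror of the json constructor.
    static trigger trigger_from_json(const json & in) {
        trigger t;
        t.type  = (trigger_type) in.at("type").get<int>();
        t.value = in.at("value").get<std::string>();
        if (t.type == TRIGGER_TYPE_TOKEN) {
            t.token = in.at("token").get<int>();
        }
        return t;
    }

    int main() {
        trigger t { TRIGGER_TYPE_TOKEN, "<tool_call>", 128010 };
        trigger u = trigger_from_json(trigger_to_json(t));
        assert(u.type == t.type && u.value == t.value && u.token == t.token);
    }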
@@ -616,9 +642,31 @@ static json oaicompat_completion_params_parse(
         throw std::runtime_error("Cannot use custom grammar constraints with tools.");
     }

+    // if the assistant message appears at the end of list, we do not add end-of-turn token
+    // for ex. this can be useful to modify the reasoning process in reasoning models
+    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
+    common_chat_msg last_message;
+    if (prefill_assistant_message) {
+        last_message = inputs.messages.back();
+        inputs.messages.pop_back();
+
+        /* sanity check, max one assistant message at the end of the list */
+        if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
+            throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
+        }
+
+        inputs.extract_reasoning = false;
+        inputs.add_generation_prompt = true;
+    }
+
     // Apply chat template to the list of messages
     auto chat_params = common_chat_templates_apply(tmpls, inputs);

+    /* Append assistant prefilled message */
+    if (prefill_assistant_message) {
+        chat_params.prompt += last_message.content;
+    }
+
     llama_params["chat_format"] = static_cast<int>(chat_params.format);
     llama_params["prompt"] = chat_params.prompt;
     if (!chat_params.grammar.empty()) {

@@ -627,7 +675,8 @@ static json oaicompat_completion_params_parse(
     llama_params["grammar_lazy"] = chat_params.grammar_lazy;
     auto grammar_triggers = json::array();
     for (const auto & trigger : chat_params.grammar_triggers) {
-
+        server_grammar_trigger ct(trigger);
+        grammar_triggers.push_back(ct.to_json());
     }
     llama_params["grammar_triggers"] = grammar_triggers;
     llama_params["preserved_tokens"] = chat_params.preserved_tokens;
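For illustration, a hedged sketch of the request shape this enables: if the last message in a /v1/chat/completions body has role "assistant", the server now applies the chat template without it and then appends its content, so generation continues that message instead of starting a new turn (example body, not from the package):

    #include <nlohmann/json.hpp>
    using json = nlohmann::json;

    int main() {
        // With the prefill logic above, "Once upon a" is appended to the
        // templated prompt and the model continues the assistant message.
        json body = {
            {"messages", json::array({
                json{{"role", "user"},      {"content", "Tell me a story."}},
                json{{"role", "assistant"}, {"content", "Once upon a"}},
            })},
        };
        (void) body;
    }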

package/src/llama.cpp/examples/sycl/build.sh

@@ -8,10 +8,10 @@ cd build
 source /opt/intel/oneapi/setvars.sh

 #for FP16
-#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # faster for long-prompt inference
+#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_CURL=OFF # faster for long-prompt inference

 #for FP32
-cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=OFF

 #build example/main
 #cmake --build . --config Release --target main

package/src/llama.cpp/examples/sycl/win-build-sycl.bat

@@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR

 :: for FP16
 :: faster for long-prompt inference
-:: cmake -G "MinGW Makefiles" ..
+:: cmake -G "MinGW Makefiles" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON

 :: for FP32
-cmake -G "Ninja" ..
+cmake -G "Ninja" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
 if %errorlevel% neq 0 goto ERROR
 :: build example/main only
 :: make main

package/src/llama.cpp/examples/tts/tts.cpp

@@ -577,12 +577,7 @@ int main(int argc, char ** argv) {

     const llama_vocab * vocab = llama_model_get_vocab(model_ttc);

-
-    params.model = params.vocoder.model;
-    params.model_url = params.vocoder.model_url;
-    params.hf_repo = params.vocoder.hf_repo;
-    params.hf_file = params.vocoder.hf_file;
-
+    params.model = params.vocoder.model;
     params.embedding = true;

     common_init_result llama_init_cts = common_init_from_params(params);

@@ -699,11 +694,13 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
     const std::string voice_data = audio_data;

     auto tmp = common_tokenize(vocab, voice_data, false, true);
-
+
+    std::ostringstream tokens_oss;
     for (size_t i = 0; i < tmp.size(); ++i) {
-
+        tokens_oss << tmp[i] << ", ";
     }
-
+    LOG_INF("\n\n%s: llama tokens: %s\n\n", __func__, tokens_oss.str().c_str());
+
     prompt_add(prompt_inp, tmp);
 #else
     prompt_add(prompt_inp, llama_tokens {

package/src/llama.cpp/ggml/CMakeLists.txt

@@ -100,9 +100,14 @@ else()
     set(INS_ENB ON)
 endif()

+message(DEBUG "GGML_NATIVE : ${GGML_NATIVE}")
+message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
+message(DEBUG "INS_ENB : ${INS_ENB}")
+
 option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
 option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
 option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
+option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB})
 option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
 option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
 option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})

@@ -123,10 +128,12 @@ endif()
 option(GGML_LASX "ggml: enable lasx" ON)
 option(GGML_LSX "ggml: enable lsx" ON)
 option(GGML_RVV "ggml: enable rvv" ON)
+option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
 option(GGML_VXE "ggml: enable vxe" ON)

 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
-set(GGML_CPU_ARM_ARCH
+set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
+set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")


 if (WIN32)

@@ -164,7 +171,6 @@ option(GGML_HIP "ggml: use HIP"
 option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
 option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
-option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)

package/src/llama.cpp/ggml/cmake/GitVars.cmake

@@ -0,0 +1,22 @@
+find_package(Git)
+
+# the commit's SHA1
+execute_process(COMMAND
+  "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
+  WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+  OUTPUT_VARIABLE GIT_SHA1
+  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# the date of the commit
+execute_process(COMMAND
+  "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
+  WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+  OUTPUT_VARIABLE GIT_DATE
+  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# the subject of the commit
+execute_process(COMMAND
+  "${GIT_EXECUTABLE}" log -1 --format=%s
+  WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+  OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
+  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

package/src/llama.cpp/ggml/include/ggml-cpu.h

@@ -133,6 +133,11 @@ extern "C" {

     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
+
 #ifdef __cplusplus
 }
 #endif
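For illustration, a hedged sketch of the newly exported CPU conversion helpers, round-tripping a small buffer through fp16 (assumes the CPU backend is linked; expect small rounding differences):

    #include "ggml-cpu.h"
    #include <cstdio>

    int main() {
        float       src[4] = { 0.1f, 1.5f, -2.25f, 3.0f };
        ggml_fp16_t half[4];
        float       back[4];

        // Convert fp32 -> fp16 -> fp32 using the exported helpers.
        ggml_cpu_fp32_to_fp16(src, half, 4);
        ggml_cpu_fp16_to_fp32(half, back, 4);

        for (int i = 0; i < 4; i++) {
            std::printf("%f -> %f\n", src[i], back[i]);
        }
    }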

package/src/llama.cpp/ggml/include/ggml-rpc.h

@@ -7,6 +7,9 @@
 extern "C" {
 #endif

+#define RPC_PROTO_MAJOR_VERSION 2
+#define RPC_PROTO_MINOR_VERSION 0
+#define RPC_PROTO_PATCH_VERSION 0
 #define GGML_RPC_MAX_SERVERS 16

 // backend API

@@ -17,7 +20,9 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c

 GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);

-GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
+GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
+                                                    const char * cache_dir,
+                                                    size_t free_mem, size_t total_mem);

 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
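For illustration, a hedged sketch of the updated `ggml_backend_rpc_start_server` call with the new `cache_dir` parameter; serving the CPU backend is just an example, and the advertised memory figures and cache path are arbitrary:

    #include "ggml-backend.h"
    #include "ggml-cpu.h"
    #include "ggml-rpc.h"

    int main() {
        ggml_backend_t backend = ggml_backend_cpu_init();
        // cache_dir lets the server keep a local tensor cache on disk;
        // passing NULL is assumed to disable caching.
        ggml_backend_rpc_start_server(backend, "0.0.0.0:50052", "/tmp/rpc-cache",
                                      /*free_mem*/  8ull << 30,
                                      /*total_mem*/ 16ull << 30);
        ggml_backend_free(backend);
    }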

package/src/llama.cpp/ggml/include/ggml.h

@@ -393,8 +393,8 @@ extern "C" {

     // precision
     enum ggml_prec {
-        GGML_PREC_DEFAULT,
-        GGML_PREC_F32,
+        GGML_PREC_DEFAULT =  0, // stored as ggml_tensor.op_params, 0 by default
+        GGML_PREC_F32     = 10,
     };

     // model file types
@@ -481,6 +481,7 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
@@ -507,17 +508,12 @@ extern "C" {

         GGML_OP_UNARY,

-        GGML_OP_MAP_UNARY,
-        GGML_OP_MAP_BINARY,
-
-        GGML_OP_MAP_CUSTOM1_F32,
-        GGML_OP_MAP_CUSTOM2_F32,
-        GGML_OP_MAP_CUSTOM3_F32,
-
         GGML_OP_MAP_CUSTOM1,
         GGML_OP_MAP_CUSTOM2,
         GGML_OP_MAP_CUSTOM3,

+        GGML_OP_CUSTOM,
+
         GGML_OP_CROSS_ENTROPY_LOSS,
         GGML_OP_CROSS_ENTROPY_LOSS_BACK,
         GGML_OP_OPT_STEP_ADAMW,
@@ -682,6 +678,9 @@ extern "C" {
     GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
     GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2

+    // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
+    GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
+
     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
@@ -1665,7 +1664,7 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);

-    // depthwise
+    // depthwise (via im2col and mul_mat)
     GGML_API struct ggml_tensor * ggml_conv_2d_dw(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,  // convolution kernel
@@ -1677,6 +1676,22 @@ extern "C" {
             int                   d0,  // dilation dimension 0
             int                   d1); // dilation dimension 1

+    // Depthwise 2D convolution
+    // may be faster than ggml_conv_2d_dw, but not available in all backends
+    // a:   KW    KH    1    C    convolution kernel
+    // b:   W     H     C    N    input data
+    // res: W_out H_out C    N
+    GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   stride0,
+            int                   stride1,
+            int                   pad0,
+            int                   pad1,
+            int                   dilation0,
+            int                   dilation1);
+
     GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
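For illustration, a hedged sketch of the new direct depthwise convolution using the tensor layouts documented above (the helper name and sizes are arbitrary; a valid `ggml_context` is assumed):

    #include "ggml.h"

    // 3x3 depthwise convolution over a 32x32 input with 8 channels.
    struct ggml_tensor * example_dw(struct ggml_context * ctx) {
        // kernel: [KW, KH, 1, C], input: [W, H, C, N]
        struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 1, 8);
        struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 32, 32, 8, 1);
        // stride 1, padding 1, dilation 1 -> result stays [32, 32, 8, 1]
        return ggml_conv_2d_dw_direct(ctx, kernel, input, 1, 1, 1, 1, 1, 1);
    }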
@@ -1722,24 +1737,29 @@ extern "C" {
             float                 p0,
             float                 p1);

-
+    enum ggml_scale_mode {
+        GGML_SCALE_MODE_NEAREST  = 0,
+        GGML_SCALE_MODE_BILINEAR = 1,
+    };
+
+    // interpolate
     // multiplies ne0 and ne1 by scale factor
-    // used in stable-diffusion
     GGML_API struct ggml_tensor * ggml_upscale(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            int                   scale_factor
+            int                   scale_factor,
+            enum ggml_scale_mode  mode);

-    //
-    //
-    // used in tortoise.cpp
+    // interpolate
+    // interpolate scale to specified dimensions
     GGML_API struct ggml_tensor * ggml_upscale_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   ne0,
             int                   ne1,
             int                   ne2,
-            int                   ne3
+            int                   ne3,
+            enum ggml_scale_mode  mode);

     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     GGML_API struct ggml_tensor * ggml_pad(
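For illustration, a hedged sketch of the extended `ggml_upscale` signature; callers that want the pre-change behavior would pass GGML_SCALE_MODE_NEAREST (helper name and sizes are arbitrary; a valid `ggml_context` is assumed):

    #include "ggml.h"

    // Upscale a 16x16, 3-channel tensor by 2x with bilinear filtering.
    struct ggml_tensor * example_upscale(struct ggml_context * ctx) {
        struct ggml_tensor * img = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 3, 1);
        return ggml_upscale(ctx, img, 2, GGML_SCALE_MODE_BILINEAR);
    }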
@@ -1791,11 +1811,11 @@ extern "C" {

 #define GGML_KQ_MASK_PAD 64

-    // q:    [
-    // k:    [
-    // v:    [
-    // mask: [n_kv,
-    // res:  [
+    // q:    [n_embd_k, n_batch,     n_head,    1]
+    // k:    [n_embd_k, n_kv,        n_head_kv, 1]
+    // v:    [n_embd_v, n_kv,        n_head_kv, 1] !! not transposed !!
+    // mask: [n_kv,     n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd_v, n_head,      n_batch,   1] !! permuted !!
     GGML_API struct ggml_tensor * ggml_flash_attn_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
@@ -1916,83 +1936,6 @@ extern "C" {

     // custom operators

-    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
-    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
-    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            ggml_unary_op_f32_t   fun),
-        "use ggml_map_custom1 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            ggml_unary_op_f32_t   fun),
-        "use ggml_map_custom1_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            ggml_binary_op_f32_t  fun),
-        "use ggml_map_custom2 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            ggml_binary_op_f32_t  fun),
-        "use ggml_map_custom2_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            ggml_custom1_op_f32_t   fun),
-        "use ggml_map_custom1 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            ggml_custom1_op_f32_t   fun),
-        "use ggml_map_custom1_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            struct ggml_tensor    * b,
-            ggml_custom2_op_f32_t   fun),
-        "use ggml_map_custom2 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            struct ggml_tensor    * b,
-            ggml_custom2_op_f32_t   fun),
-        "use ggml_map_custom2_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            struct ggml_tensor    * b,
-            struct ggml_tensor    * c,
-            ggml_custom3_op_f32_t   fun),
-        "use ggml_map_custom3 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            struct ggml_tensor    * b,
-            struct ggml_tensor    * c,
-            ggml_custom3_op_f32_t   fun),
-        "use ggml_map_custom3_inplace instead");
-
-    // custom operators v2
-
     typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
     typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
     typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
@@ -2048,6 +1991,30 @@ extern "C" {
             int                    n_tasks,
             void                 * userdata);

+    typedef void (*ggml_custom_op_t)(struct ggml_tensor * dst , int ith, int nth, void * userdata);
+
+    GGML_API struct ggml_tensor * ggml_custom_4d(
+            struct ggml_context * ctx,
+            enum ggml_type        type,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3,
+            struct ggml_tensor ** args,
+            int                   n_args,
+            ggml_custom_op_t      fun,
+            int                   n_tasks,
+            void                * userdata);
+
+    GGML_API struct ggml_tensor * ggml_custom_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor ** args,
+            int                   n_args,
+            ggml_custom_op_t      fun,
+            int                   n_tasks,
+            void                * userdata);
+
     // loss function

     GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
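For illustration, a hedged sketch of the new unified custom op, which replaces the removed _f32 map variants. It is assumed here that tensors passed via `args` are reachable from the callback as `dst->src[i]` and that the callback runs during graph compute; `n_tasks` of 1 keeps it single-threaded:

    #include "ggml.h"

    // Callback for ggml_custom_4d: write 2*a into the destination.
    static void scale_by_two(struct ggml_tensor * dst, int ith, int nth, void * userdata) {
        (void) ith; (void) nth; (void) userdata;
        const struct ggml_tensor * a = dst->src[0]; // assumed: args land in dst->src
        const float * in  = (const float *) a->data;
        float       * out = (float *) dst->data;
        const int64_t n = ggml_nelements(dst);
        for (int64_t i = 0; i < n; i++) {
            out[i] = 2.0f * in[i];
        }
    }

    struct ggml_tensor * example_custom(struct ggml_context * ctx, struct ggml_tensor * a) {
        struct ggml_tensor * args[1] = { a };
        return ggml_custom_4d(ctx, GGML_TYPE_F32,
                              a->ne[0], a->ne[1], a->ne[2], a->ne[3],
                              args, 1, scale_by_two, /*n_tasks*/ 1, /*userdata*/ NULL);
    }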

package/src/llama.cpp/ggml/src/CMakeLists.txt

@@ -65,7 +65,7 @@ if (GGML_LTO)
     endif()
 endif()

-if (GGML_CCACHE)
+if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAUNCHER)
     find_program(GGML_CCACHE_FOUND ccache)
     find_program(GGML_SCCACHE_FOUND sccache)

@@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name)
     set(GGML_CPU_TAG_NAME ${tag_name})
     # other: OPENMP LLAMAFILE CPU_HBM
     foreach (feat NATIVE
+                  SSE42
                   AVX AVX2 BMI2 AVX_VNNI FMA F16C
                   AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
                   AMX_TILE AMX_INT8 AMX_BF16)
@@ -286,14 +287,16 @@ if (GGML_CPU_ALL_VARIANTS)
     if (NOT GGML_BACKEND_DL)
         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
     endif()
-    ggml_add_cpu_backend_variant(
-    ggml_add_cpu_backend_variant(
-    ggml_add_cpu_backend_variant(
-    ggml_add_cpu_backend_variant(
-    ggml_add_cpu_backend_variant(
+    ggml_add_cpu_backend_variant(x64)
+    ggml_add_cpu_backend_variant(sse42       SSE42)
+    ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
+    ggml_add_cpu_backend_variant(haswell     SSE42 AVX F16C AVX2 BMI2 FMA)
+    ggml_add_cpu_backend_variant(skylakex    SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
+    ggml_add_cpu_backend_variant(icelake     SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+    ggml_add_cpu_backend_variant(alderlake   SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
     if (NOT MSVC)
         # MSVC doesn't support AMX
-        ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
+        ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
     endif()
 elseif (GGML_CPU)
     ggml_add_cpu_backend_variant_impl("")

package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt

@@ -51,13 +51,11 @@ if (CANN_INSTALL_DIR)
         ${CANN_INSTALL_DIR}/acllib/include
     )

-    add_subdirectory(kernels)
     list(APPEND CANN_LIBRARIES
         ascendcl
         nnopbase
         opapi
         acl_op_compiler
-        ascendc_kernels
     )

     file(GLOB GGML_SOURCES_CANN "*.cpp")

package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp

@@ -41,6 +41,8 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
             return ACL_INT4;
         case GGML_TYPE_Q8_0:
             return ACL_INT8;
+        case GGML_TYPE_I64:
+            return ACL_INT64;
         default:
             return ACL_DT_UNDEFINED;
     }
@@ -54,9 +56,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
     // added.
     int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];

-    int64_t acl_storage_len = 0;
     if (ne == nullptr) {
-        acl_storage_len = ggml_nbytes(tensor);
         for (int i = 0; i < GGML_MAX_DIMS; i++) {
             acl_ne[i] = tensor->ne[i];
             // The step size of acl is in elements.
@@ -65,14 +65,18 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
     } else {
         // With bcast
         for (int i = 0; i < dims; i++) {
-            acl_storage_len += (ne[i] - 1) * nb[i];
             acl_ne[i] = ne[i];
             acl_stride[i] = nb[i] / ggml_element_size(tensor);
         }
     }

-    // Reverse ne and stride.
     int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
+    int64_t acl_storage_len = 1;
+    for (int i = 0; i < final_dims; i++) {
+        acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
+    }
+
+    // Reverse ne and stride.
     std::reverse(acl_ne, acl_ne + final_dims);
     std::reverse(acl_stride, acl_stride + final_dims);

@@ -101,14 +101,14 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
         tmp_stride[i] = nb[i] / type_size;
     }

-
-    std::reverse(tmp_stride, tmp_stride + dims);
-
-    int64_t acl_storage_len = 0;
+    int64_t acl_storage_len = 1;
     for (int i = 0; i < dims; i++) {
-        acl_storage_len += (
+        acl_storage_len += (tmp_ne[i] - 1) * tmp_stride[i];
     }

+    std::reverse(tmp_ne, tmp_ne + dims);
+    std::reverse(tmp_stride, tmp_stride + dims);
+
     aclTensor* acl_tensor =
         aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
                         format, &acl_storage_len, 1, data_ptr);