@fugood/llama.node 1.4.10 → 1.4.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +30 -30
- package/src/LlamaContext.cpp +1 -1
- package/src/llama.cpp/common/arg.cpp +29 -14
- package/src/llama.cpp/common/arg.h +1 -0
- package/src/llama.cpp/common/chat-parser.cpp +11 -0
- package/src/llama.cpp/common/chat.cpp +32 -3
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +23 -23
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- package/src/llama.cpp/include/llama.h +13 -4
- package/src/llama.cpp/src/CMakeLists.txt +4 -0
- package/src/llama.cpp/src/llama-adapter.cpp +12 -3
- package/src/llama.cpp/src/llama-adapter.h +7 -1
- package/src/llama.cpp/src/llama-arch.cpp +76 -0
- package/src/llama.cpp/src/llama-arch.h +7 -0
- package/src/llama.cpp/src/llama-chat.cpp +11 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +22 -21
- package/src/llama.cpp/src/llama-hparams.h +4 -3
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +287 -16
- package/src/llama.cpp/src/llama-model.h +13 -2
- package/src/llama.cpp/src/llama-sampling.cpp +44 -33
- package/src/llama.cpp/src/llama-sampling.h +3 -0
- package/src/llama.cpp/src/llama-vocab.cpp +101 -33
- package/src/llama.cpp/src/llama-vocab.h +2 -0
- package/src/llama.cpp/src/llama.cpp +52 -37
- package/src/llama.cpp/src/models/bert.cpp +4 -2
- package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
- package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
- package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- package/src/llama.cpp/src/models/gemma3.cpp +3 -4
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- package/src/llama.cpp/src/models/llama.cpp +19 -6
- package/src/llama.cpp/src/models/maincoder.cpp +117 -0
- package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- package/src/llama.cpp/src/models/models.h +18 -0
- package/src/llama.cpp/src/models/modern-bert.cpp +115 -0
- package/src/llama.cpp/src/models/plamo3.cpp +128 -0
- package/src/llama.cpp/src/unicode.cpp +23 -14
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.10",
+  "version": "1.4.12",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.10",
-    "@fugood/node-llama-darwin-x64": "1.4.10",
-    "@fugood/node-llama-linux-arm64": "1.4.10",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.10",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.10",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.10",
-    "@fugood/node-llama-linux-x64": "1.4.10",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.10",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.10",
-    "@fugood/node-llama-win32-arm64": "1.4.10",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.10",
-    "@fugood/node-llama-win32-x64": "1.4.10",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.10",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.10"
+    "@fugood/node-llama-darwin-arm64": "1.4.12",
+    "@fugood/node-llama-darwin-x64": "1.4.12",
+    "@fugood/node-llama-linux-arm64": "1.4.12",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.12",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.12",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.12",
+    "@fugood/node-llama-linux-x64": "1.4.12",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.12",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.12",
+    "@fugood/node-llama-win32-arm64": "1.4.12",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.12",
+    "@fugood/node-llama-win32-x64": "1.4.12",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.12",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.12"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED
@@ -32,7 +32,7 @@ index 1bcba9cd8..b7cd68734 100644
  static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
      int count = 0;
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index
+index b98ab21ce..2f782837a 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -7,9 +7,6 @@
@@ -62,7 +62,7 @@ index 0a426f447..ab02be247 100644
  struct templates_params {
      json messages;
      json tools;
-@@ -
+@@ -752,7 +739,7 @@ static std::string apply(
          tmpl_inputs.extra_context.merge_patch(*additional_context);
      }
      // TODO: add flag to control date/time, if only for testing purposes.
@@ -72,7 +72,7 @@ index 0a426f447..ab02be247 100644
      minja::chat_template_options tmpl_opts;
      // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
 diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
-index
+index 8bd4a325f..333b3301f 100644
 --- a/src/llama.cpp/common/chat.h
 +++ b/src/llama.cpp/common/chat.h
 @@ -10,7 +10,18 @@
@@ -96,19 +96,19 @@ index 6085510a4..263076ce2 100644
  struct common_chat_tool_call {
      std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index
+index 79c475612..cf189f8bc 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -
-     mparams.
+@@ -1342,6 +1342,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+         mparams.devices = params.devices.data();
      }
 
 +    mparams.vocab_only = params.vocab_only;
+     mparams.n_gpu_layers = params.n_gpu_layers;
      mparams.main_gpu = params.main_gpu;
      mparams.split_mode = params.split_mode;
-     mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index f8bc686b6..555ba044a 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
 @@ -307,6 +307,7 @@ struct lr_opt {
@@ -120,7 +120,7 @@ index 334372073..e912b593a 100644
  int32_t n_ctx = 0; // context size, 0 == context the model was trained with
  int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index
+index 7622d0bf4..d2edcfddb 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -133,14 +133,13 @@ index 28fb7612e..63f7e1ca1 100644
  check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
  if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
 diff --git a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
-index
+index 13b96d61f..5fa163442 100644
 --- a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
 +++ b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
-@@ -
+@@ -2680,9 +2680,24 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
      GGML_UNUSED(dev);
  }
 
-+
 +// ~2GB per session for now
 +#define GGML_HEXAGON_SESSION_MEMORY_DEFAULT (2ULL * 1024 * 1024 * 1024)
 +// Max to 3.5GB
@@ -149,7 +148,6 @@ index 6a00abacc..9e12459b6 100644
  static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
 -    // ~2GB per session for now
 -    *free = 2ULL * 1024 * 1024 * 1024;
--    *total = *free;
 +    const char * str_mem = getenv("GGML_HEXAGON_SESSION_MEMORY");
 +    if (str_mem) {
 +        *free = std::stoull(str_mem);
@@ -161,32 +159,34 @@ index 6a00abacc..9e12459b6 100644
 +    } else {
 +        *free = GGML_HEXAGON_SESSION_MEMORY_DEFAULT;
 +    }
++
++    *total = *free;
 
-+    *total = *free;
      GGML_UNUSED(dev);
-@@ -3413,10 +3428,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
+@@ -2879,10 +2894,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
  }
  }
 
-        opt_ndev = 1;
+-    if (opt_arch < 75) {
+-        opt_ndev = 1;
 -        GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
+-    }
++    #if defined(__ANDROID__)
++    if(opt_arch < 75) {
++        opt_ndev = 1;
++        GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75 for Android.\n");
++    }
++    #else
++    if(opt_arch < 73) {
++        opt_ndev = 1;
++        GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v73 for Linux and Windows.\n");
++    }
++    #endif
 
  GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
 
-@@ -
-} catch (std::exception
+@@ -2895,6 +2917,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
+} catch (const std::exception & exc) {
  GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
  devices[i].context = nullptr;
 + opt_ndev = i;
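
Note: the GGML_HEXAGON_SESSION_MEMORY handling patched in above reduces to a small amount of logic. A minimal standalone sketch in C++ (the helper name is ours, and the cap constant is an assumption taken from the truncated "Max to 3.5GB" comment rather than from visible code):

#include <cstdlib>
#include <string>

// Default of ~2GB per session, as in the patch above.
static const unsigned long long kSessionMemDefault = 2ULL * 1024 * 1024 * 1024;
// Assumed cap; the hunk only says "Max to 3.5GB".
static const unsigned long long kSessionMemMax = 3584ULL * 1024 * 1024;

// Reported free memory: the env var overrides the default, clamped to the cap.
static unsigned long long hexagon_session_memory() {
    const char * str_mem = std::getenv("GGML_HEXAGON_SESSION_MEMORY");
    unsigned long long mem = str_mem ? std::stoull(str_mem) : kSessionMemDefault;
    return mem > kSessionMemMax ? kSessionMemMax : mem;
}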
package/src/LlamaContext.cpp
CHANGED
@@ -585,7 +585,7 @@ Napi::Value LlamaContext::GetModelInfo(const Napi::CallbackInfo &info) {
   for (int i = 0; i < count; i++) {
     char key[256];
     llama_model_meta_key_by_index(model, i, key, sizeof(key));
-    char val[
+    char val[16384];
     llama_model_meta_val_str_by_index(model, i, val, sizeof(val));

    metadata.Set(key, val);
package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -2017,7 +2017,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",
-            "comma separated list of RPC servers",
+            "comma separated list of RPC servers (host:port)",
             [](common_params & params, const std::string & value) {
                 add_rpc_devices(value);
                 GGML_UNUSED(params);
@@ -2087,7 +2087,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "override tensor buffer type", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
         }
-    ));
+    ).set_env("LLAMA_ARG_OVERRIDE_TENSOR"));
     add_opt(common_arg(
         {"-otd", "--override-tensor-draft"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
@@ -2137,11 +2137,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
+    GGML_ASSERT(params.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
-        string_format("max. number of layers to store in VRAM (default: %
-        [](common_params & params,
-
+        string_format("max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)", params.n_gpu_layers == -1 ? "auto" : "all"),
+        [](common_params & params, const std::string & value) {
+            if (value == "auto") {
+                params.n_gpu_layers = -1;
+            } else if (value == "all") {
+                params.n_gpu_layers = -2;
+            } else {
+                params.n_gpu_layers = std::stoi(value);
+            }
             if (!llama_supports_gpu_offload()) {
                 fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
                 fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
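
Note: -ngl now accepts 'auto' and 'all' in addition to a plain integer, encoded as the sentinels -1 and -2 in params.n_gpu_layers. A minimal standalone sketch of that mapping (the helper name is ours; the real logic lives inline in the option callback above):

#include <string>

// -1 = fit layers to free device memory automatically, -2 = offload all layers,
// anything else is taken as an explicit layer count.
static int parse_n_gpu_layers(const std::string & value) {
    if (value == "auto") return -1;
    if (value == "all")  return -2;
    return std::stoi(value); // throws std::invalid_argument on non-numeric input
}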
@@ -3175,11 +3182,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.devices = parse_device_list(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+    GGML_ASSERT(params.speculative.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
     add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
-        "number of layers to store in VRAM
-
-
+        string_format("max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)",
+            params.speculative.n_gpu_layers == -1 ? "auto" : "all"),
+        [](common_params & params, const std::string & value) {
+            if (value == "auto") {
+                params.speculative.n_gpu_layers = -1;
+            } else if (value == "all") {
+                params.speculative.n_gpu_layers = -2;
+            } else {
+                params.speculative.n_gpu_layers = std::stoi(value);
+            }
             if (!llama_supports_gpu_offload()) {
                 fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
                 fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
@@ -3518,15 +3533,15 @@ void common_params_add_preset_options(std::vector<common_arg> & args) {
         [](common_params &, const std::string &) { /* unused */ }
     ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());

+    args.push_back(common_arg(
+        {"stop-timeout"}, "SECONDS",
+        "in server router mode, force-kill model instance after this many seconds of graceful shutdown",
+        [](common_params &, int) { /* unused */ }
+    ).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
+
     // args.push_back(common_arg(
     //     {"pin"},
     //     "in server router mode, do not unload this model if models_max is exceeded",
     //     [](common_params &) { /* unused */ }
     // ).set_preset_only());
-
-    // args.push_back(common_arg(
-    //     {"unload-idle-seconds"}, "SECONDS",
-    //     "in server router mode, unload models idle for more than this many seconds",
-    //     [](common_params &, int) { /* unused */ }
-    // ).set_preset_only());
 }
package/src/llama.cpp/common/chat-parser.cpp
CHANGED

@@ -1395,6 +1395,14 @@ static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
     builder.consume_reasoning_with_xml_tool_calls(form, "<seed:think>", "</seed:think>");
 }

+static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("<|think|>", "<|end|><|begin|>assistant<|content|>");
+
+    // TODO: Tool calling
+
+    builder.add_content(builder.consume_rest());
+}
+
 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
     builder.try_parse_reasoning("<think>", "</think>");
     builder.add_content(builder.consume_rest());
@@ -1479,6 +1487,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_XIAOMI_MIMO:
             common_chat_parse_xiaomi_mimo(builder);
             break;
+        case COMMON_CHAT_FORMAT_SOLAR_OPEN:
+            common_chat_parse_solar_open(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
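
Note: for Solar Open, reasoning is delimited by <|think|> and the composite terminator <|end|><|begin|>assistant<|content|>. A rough behavioral sketch of the split (illustrative only; the function name is ours, and the real work is done by common_chat_msg_parser::try_parse_reasoning, whose partial-output handling is more involved):

#include <string>
#include <utility>

// Splits a raw completion into (reasoning, content) using the Solar Open markers.
static std::pair<std::string, std::string> split_solar_open(const std::string & out) {
    static const std::string start = "<|think|>";
    static const std::string end   = "<|end|><|begin|>assistant<|content|>";
    if (out.rfind(start, 0) != 0) {
        return {"", out};                      // no leading think block: everything is content
    }
    const size_t e = out.find(end, start.size());
    if (e == std::string::npos) {
        return {out.substr(start.size()), ""}; // unterminated: still reasoning
    }
    return {out.substr(start.size(), e - start.size()), out.substr(e + end.size())};
}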
package/src/llama.cpp/common/chat.cpp
CHANGED

@@ -306,7 +306,7 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
             }
         }
     } else {
-        jmsg["content"] =
+        jmsg["content"] = "";
     }
     if (!msg.reasoning_content.empty()) {
         jmsg["reasoning_content"] = msg.reasoning_content;
@@ -367,8 +367,8 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
             const auto & function = tool.at("function");
             result.push_back({
                 /* .name = */ function.at("name"),
-                /* .description = */ function.
-                /* .parameters = */ function.
+                /* .description = */ function.value("description", ""),
+                /* .parameters = */ function.value("parameters", json::object()).dump(),
             });
         }
     }
@@ -656,6 +656,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
         case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
         case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
+        case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
         case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
         case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
         case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";
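
Note: the common_chat_tools_parse_oaicompat change above swaps function.at(...) for function.value(...), so tool definitions without a "description" or "parameters" field no longer throw. A small self-contained demonstration of the nlohmann::json behavior it relies on:

#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    // A tool definition that omits both optional fields:
    json function = { {"name", "get_time"} };

    // .at() would throw json::out_of_range here; .value() falls back to the default.
    std::string description = function.value("description", "");
    std::string parameters  = function.value("parameters", json::object()).dump();

    std::cout << "description='" << description << "' parameters=" << parameters << "\n";
    // prints: description='' parameters={}
}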
@@ -2504,6 +2505,27 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
     return data;
 }

+static common_chat_params common_chat_params_init_solar_open(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // TODO: Reasoning effort
+    json additional_context = {};
+
+    data.prompt = apply(tmpl, inputs, std::nullopt, std::nullopt, additional_context);
+    data.format = COMMON_CHAT_FORMAT_SOLAR_OPEN;
+
+    data.preserved_tokens = {
+        "<|think|>",
+        "<|content|>",
+        "<|begin|>",
+        "<|end|>",
+    };
+
+    // TODO: Tool calling
+
+    return data;
+}
+
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);
@@ -2767,6 +2789,13 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_magistral(tmpl, params);
     }

+    // Solar Open
+    if (src.find("<|tool_response:begin|>") != std::string::npos &&
+        src.find("<|tool_response:name|>") != std::string::npos &&
+        src.find("<|tool_response:result|>") != std::string::npos) {
+        return common_chat_params_init_solar_open(tmpl, params);
+    }
+
     // Plain handler (no tools)
     if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
         return common_chat_params_init_without_tools(tmpl, params);

package/src/llama.cpp/common/chat.h
CHANGED

@@ -135,6 +135,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_QWEN3_CODER_XML,
     COMMON_CHAT_FORMAT_APRIEL_1_5,
     COMMON_CHAT_FORMAT_XIAOMI_MIMO,
+    COMMON_CHAT_FORMAT_SOLAR_OPEN,

     // These are intended to be parsed by the PEG parser
     COMMON_CHAT_FORMAT_PEG_SIMPLE,
package/src/llama.cpp/common/common.cpp
CHANGED

@@ -251,7 +251,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
         case GGML_SCHED_PRIO_REALTIME: p = -20; break;
     }

-    if (
+    if (setpriority(PRIO_PROCESS, 0, p) != 0) {
         LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
         return false;
     }
@@ -1078,6 +1078,8 @@ struct common_init_result::impl {
     impl() = default;
     ~impl() = default;

+    // note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top
+
     llama_model_ptr model;
     llama_context_ptr context;

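
Note: the comment added to common_init_result::impl above is standard C++ semantics: non-static members are destroyed in reverse order of declaration, so declaring model before context guarantees the context (which refers to the model) is torn down first. A self-contained illustration:

#include <cstdio>

struct Model   { ~Model()   { std::puts("~Model");   } };
struct Context { ~Context() { std::puts("~Context"); } };

struct Impl {
    Model   model;   // declared first, destroyed last
    Context context; // declared last, destroyed first
};

int main() {
    Impl impl;
} // prints "~Context" then "~Model"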
@@ -1107,6 +1109,25 @@ common_init_result::common_init_result(common_params & params) :

     const llama_vocab * vocab = llama_model_get_vocab(model);

+    // load and optionally apply lora adapters (must be loaded before context creation)
+    for (auto & la : params.lora_adapters) {
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
+        if (lora == nullptr) {
+            LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
+            pimpl->model.reset(model);
+            return;
+        }
+
+        char buf[1024];
+        la.ptr = lora.get();
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+        la.task_name = buf;
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+        la.prompt_prefix = buf;
+        pimpl->lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
+    }
+
     // updates params.sampling
     // TODO: fix naming
     common_init_sampler_from_model(model, params.sampling);
@@ -1243,24 +1264,6 @@ common_init_result_ptr common_init_from_params(common_params & params) {
         }
     }

-    // load and optionally apply lora adapters
-    for (auto & la : params.lora_adapters) {
-        llama_adapter_lora_ptr lora;
-        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
-        if (lora == nullptr) {
-            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
-            return res;
-        }
-
-        char buf[1024];
-        la.ptr = lora.get();
-        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
-        la.task_name = buf;
-        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
-        la.prompt_prefix = buf;
-        res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
-    }
-
     if (!params.lora_init_without_apply) {
         common_set_adapter_lora(lctx, params.lora_adapters);
     }
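
Note: the net effect of the two hunks above is to hoist LoRA loading ahead of context creation. A hedged ordering sketch against the llama.h C API (current function names assumed; error handling elided):

#include "llama.h"

static void init_with_lora(const char * model_path, const char * lora_path) {
    llama_model * model = llama_model_load_from_file(model_path, llama_model_default_params());

    // Adapters are created from the model *before* the context exists...
    llama_adapter_lora * lora = llama_adapter_lora_init(model, lora_path);

    // ...and attached once the context is up.
    llama_context * ctx = llama_init_from_model(model, llama_context_default_params());
    llama_set_adapter_lora(ctx, lora, 1.0f);

    llama_free(ctx);
    llama_model_free(model); // loaded adapters are released with their model
}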
@@ -1339,11 +1342,8 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.devices = params.devices.data();
     }

-    if (params.n_gpu_layers != -1) {
-        mparams.n_gpu_layers = params.n_gpu_layers;
-    }
-
     mparams.vocab_only = params.vocab_only;
+    mparams.n_gpu_layers = params.n_gpu_layers;
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;

package/src/llama.cpp/common/common.h
CHANGED

@@ -330,7 +330,7 @@ struct common_params {
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     bool fit_params = true; // whether to fit unset model/context parameters to free device memory
package/src/llama.cpp/ggml/CMakeLists.txt
CHANGED

@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 9)
-set(GGML_VERSION_PATCH
+set(GGML_VERSION_PATCH 5)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

 find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
@@ -430,10 +430,22 @@ if (MSVC)
     configure_msvc_target(ggml-cpu-x64)
     configure_msvc_target(ggml-cpu-sse42)
     configure_msvc_target(ggml-cpu-sandybridge)
+    # __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+    # skipping ggml-cpu-ivybridge
+    # skipping ggml-cpu-piledriver
     configure_msvc_target(ggml-cpu-haswell)
     configure_msvc_target(ggml-cpu-skylakex)
+    configure_msvc_target(ggml-cpu-cannonlake)
+    configure_msvc_target(ggml-cpu-cascadelake)
     configure_msvc_target(ggml-cpu-icelake)
+    # MSVC 2022 doesn't support BF16 intrinsics without `/arch:AVX10.1` ?!
+    # https://learn.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-170
+    # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170
+    # skipping ggml-cpu-cooperlake
+    # skipping ggml-cpu-zen4
     configure_msvc_target(ggml-cpu-alderlake)
+    # MSVC doesn't support AMX
+    # skipping ggml-cpu-sapphirerapids

     if (GGML_BUILD_EXAMPLES)
         configure_msvc_target(common-ggml)
package/src/llama.cpp/ggml/include/ggml-backend.h
CHANGED

@@ -358,7 +358,7 @@ extern "C" {
     typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

     // Compare the output of two backends
-    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor *
+    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor const * const * test_nodes, size_t num_test_nodes);

     // Tensor initialization
     GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
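
Note: the extended ggml_backend_compare_graph_backend above takes an explicit set of nodes to check. A hedged sketch of a matching callback (we assume passing test_nodes = NULL with num_test_nodes = 0 preserves the old compare-everything behavior; this diff does not document that):

#include "ggml-backend.h"
#include <cstdio>

// Signature matches the ggml_backend_eval_callback typedef shown above.
static bool eval_cb(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
    (void) user_data;
    std::printf("node %d: comparing '%s' against '%s'\n", node_index, t1->name, t2->name);
    return true; // continue with the next node
}

// Usage, with backend1/backend2/graph constructed elsewhere:
//   ggml_backend_compare_graph_backend(backend1, backend2, graph, eval_cb,
//                                      /*user_data=*/NULL,
//                                      /*test_nodes=*/NULL, /*num_test_nodes=*/0);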
package/src/llama.cpp/ggml/src/CMakeLists.txt
CHANGED

@@ -357,15 +357,29 @@ if (GGML_CPU_ALL_VARIANTS)
     endif()
     if (GGML_SYSTEM_ARCH STREQUAL "x86")
         ggml_add_cpu_backend_variant(x64)
-        ggml_add_cpu_backend_variant(sse42
-        ggml_add_cpu_backend_variant(sandybridge
-
-
-
-
+        ggml_add_cpu_backend_variant(sse42 SSE42)
+        ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
+        if (NOT MSVC)
+            # __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+            ggml_add_cpu_backend_variant(ivybridge SSE42 AVX F16C)
+            ggml_add_cpu_backend_variant(piledriver SSE42 AVX F16C FMA)
+        endif()
+        ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C FMA AVX2 BMI2)
+        ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C FMA AVX2 BMI2 AVX512)
+        ggml_add_cpu_backend_variant(cannonlake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI)
+        ggml_add_cpu_backend_variant(cascadelake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VNNI)
+        ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI)
+        if (NOT MSVC)
+            # MSVC 2022 doesn't support BF16 intrinsics without `/arch:AVX10.1` ?!
+            # https://learn.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-170
+            # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170
+            ggml_add_cpu_backend_variant(cooperlake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VNNI AVX512_BF16)
+            ggml_add_cpu_backend_variant(zen4 SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16)
+        endif()
+        ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C FMA AVX2 BMI2 AVX_VNNI)
         if (NOT MSVC)
             # MSVC doesn't support AMX
-            ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2
+            ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
         endif()
     elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
         if (CMAKE_SYSTEM_NAME MATCHES "Linux")
@@ -387,8 +401,8 @@ if (GGML_CPU_ALL_VARIANTS)
             ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
             ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
             ggml_add_cpu_backend_variant(android_armv9.0_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE2)
-            ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SME)
-            ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
+            ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
+            ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SVE2 SME)
         elseif (APPLE)
             ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
             ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
CHANGED

@@ -561,9 +561,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

         # Fetch KleidiAI sources:
         include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.
+        set(KLEIDIAI_COMMIT_TAG "v1.16.0")
         set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5 "
+        set(KLEIDIAI_ARCHIVE_MD5 "0a9e9008adb6031f9e8cf70dff4a3321")

         if (POLICY CMP0135)
             cmake_policy(SET CMP0135 NEW)
@@ -615,6 +615,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         string(FIND "${ARCH_FLAGS_TEMP}" "+dotprod" DOTPROD_ENABLED)
         string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
         string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
+        string(FIND "${ARCH_FLAGS_TEMP}" "+sve" SVE_ENABLED)

         set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})

@@ -659,6 +660,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
         endif()

+        if (NOT SVE_ENABLED MATCHES -1)
+            list(APPEND GGML_KLEIDIAI_SOURCES
+                ${KLEIDIAI_SRC}/kai/kai_common_sve_asm.S
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod_asm.S
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm_asm.S
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm.c)
+        endif()
+
         set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
         list(APPEND GGML_CPU_SOURCES ${GGML_KLEIDIAI_SOURCES})
     endif()
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h
CHANGED

@@ -328,7 +328,7 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)

 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
-#elif defined(
+#elif defined(__SSE__) || defined(__SSE3__) || defined(__SSSE3__) || defined(__AVX__) || defined(__F16C__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX512BF16__)
 #include <immintrin.h>
 #endif
