@fugood/llama.node 1.2.2 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +33 -11
- package/src/llama.cpp/CMakeLists.txt +1 -0
- package/src/llama.cpp/common/CMakeLists.txt +46 -2
- package/src/llama.cpp/common/arg.cpp +423 -186
- package/src/llama.cpp/common/arg.h +0 -1
- package/src/llama.cpp/common/chat-parser.cpp +154 -13
- package/src/llama.cpp/common/chat-parser.h +3 -0
- package/src/llama.cpp/common/chat.cpp +217 -6
- package/src/llama.cpp/common/chat.h +5 -3
- package/src/llama.cpp/common/common.cpp +23 -6
- package/src/llama.cpp/common/common.h +6 -4
- package/src/llama.cpp/common/http.h +73 -0
- package/src/llama.cpp/common/sampling.cpp +1 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -1
- package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +22 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +18 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +27 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +6 -5
- package/src/llama.cpp/include/llama.h +23 -11
- package/src/llama.cpp/src/llama-arch.cpp +93 -0
- package/src/llama.cpp/src/llama-arch.h +22 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -1
- package/src/llama.cpp/src/llama-context.cpp +157 -0
- package/src/llama.cpp/src/llama-context.h +10 -0
- package/src/llama.cpp/src/llama-graph.cpp +57 -22
- package/src/llama.cpp/src/llama-graph.h +10 -1
- package/src/llama.cpp/src/llama-hparams.h +17 -2
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +10 -2
- package/src/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +10 -5
- package/src/llama.cpp/src/llama-kv-cache.h +2 -0
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
- package/src/llama.cpp/src/llama-memory-hybrid.h +2 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +19 -3
- package/src/llama.cpp/src/llama-memory-recurrent.h +3 -0
- package/src/llama.cpp/src/llama-memory.h +3 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model.cpp +582 -45
- package/src/llama.cpp/src/llama-model.h +23 -1
- package/src/llama.cpp/src/llama-sampling.cpp +5 -0
- package/src/llama.cpp/src/llama-vocab.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.h +41 -40
- package/src/llama.cpp/src/unicode.h +43 -0

package/src/llama.cpp/common/common.h

@@ -379,7 +379,7 @@ struct common_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool no_perf = false; // disable performance metrics
-    bool ctx_shift = false;
+    bool ctx_shift = false; // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache
 
@@ -393,6 +393,7 @@ struct common_params {
     bool check_tensors = false; // validate tensor data
     bool no_op_offload = false; // globally disable offload host tensor operations to device
     bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
+    bool no_host = false; // bypass host buffer allowing extra buffers to be used
 
     bool single_turn = false; // single turn chat conversation
 
@@ -425,7 +426,8 @@ struct common_params {
     int32_t timeout_write = timeout_read; // http write timeout in seconds
     int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
-    int32_t
+    int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
+    int32_t cache_ram_mib = 8192; // 0 = no limit, 1 = 1 MiB, etc.
 
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
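
The new `cache_ram_mib` limit is denominated in MiB, with `0` meaning no limit. A minimal sketch (not package code; the helper name is hypothetical) of how such a knob translates into a byte budget:

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical helper: convert the MiB-denominated knob into a byte budget,
// treating 0 as "no limit" per the comment in the hunk above.
static size_t cache_ram_limit_bytes(int32_t cache_ram_mib) {
    if (cache_ram_mib == 0) {
        return SIZE_MAX; // 0 = no limit
    }
    return (size_t) cache_ram_mib * 1024 * 1024; // 1 = 1 MiB, etc.
}
```
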
@@ -433,7 +435,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
    bool enable_chat_template = true;
-    common_reasoning_format reasoning_format =
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
@@ -739,7 +741,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 // MoE utils
 //
 
-const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)
+const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
 
 static std::string llm_ffn_exps_block_regex(int idx) {
     return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
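
The expert-tensor regex now also accepts a `_chexps` suffix in addition to `_exps`. A standalone check (mine, not package code) of the pattern that `llm_ffn_exps_block_regex(0)` builds:

```cpp
#include <cassert>
#include <regex>
#include <string>

int main() {
    // "blk\.0" + LLM_FFN_EXPS_REGEX, as produced by llm_ffn_exps_block_regex(0)
    const std::regex re("blk\\.0\\.ffn_(up|down|gate)_(ch|)exps");

    assert(std::regex_search(std::string("blk.0.ffn_up_exps"), re));     // classic expert tensor
    assert(std::regex_search(std::string("blk.0.ffn_gate_chexps"), re)); // new "ch" variant
    assert(!std::regex_search(std::string("blk.0.ffn_up"), re));         // non-expert tensor
    return 0;
}
```
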

package/src/llama.cpp/common/http.h (new file)

@@ -0,0 +1,73 @@
+#pragma once
+
+#include <cpp-httplib/httplib.h>
+
+struct common_http_url {
+    std::string scheme;
+    std::string user;
+    std::string password;
+    std::string host;
+    std::string path;
+};
+
+static common_http_url common_http_parse_url(const std::string & url) {
+    common_http_url parts;
+    auto scheme_end = url.find("://");
+
+    if (scheme_end == std::string::npos) {
+        throw std::runtime_error("invalid URL: no scheme");
+    }
+    parts.scheme = url.substr(0, scheme_end);
+
+    if (parts.scheme != "http" && parts.scheme != "https") {
+        throw std::runtime_error("unsupported URL scheme: " + parts.scheme);
+    }
+
+    auto rest = url.substr(scheme_end + 3);
+    auto at_pos = rest.find('@');
+
+    if (at_pos != std::string::npos) {
+        auto auth = rest.substr(0, at_pos);
+        auto colon_pos = auth.find(':');
+        if (colon_pos != std::string::npos) {
+            parts.user = auth.substr(0, colon_pos);
+            parts.password = auth.substr(colon_pos + 1);
+        } else {
+            parts.user = auth;
+        }
+        rest = rest.substr(at_pos + 1);
+    }
+
+    auto slash_pos = rest.find('/');
+
+    if (slash_pos != std::string::npos) {
+        parts.host = rest.substr(0, slash_pos);
+        parts.path = rest.substr(slash_pos);
+    } else {
+        parts.host = rest;
+        parts.path = "/";
+    }
+    return parts;
+}
+
+static std::pair<httplib::Client, common_http_url> common_http_client(const std::string & url) {
+    common_http_url parts = common_http_parse_url(url);
+
+    if (parts.host.empty()) {
+        throw std::runtime_error("error: invalid URL format");
+    }
+
+    httplib::Client cli(parts.scheme + "://" + parts.host);
+
+    if (!parts.user.empty()) {
+        cli.set_basic_auth(parts.user, parts.password);
+    }
+
+    cli.set_follow_location(true);
+
+    return { std::move(cli), std::move(parts) };
+}
+
+static std::string common_http_show_masked_url(const common_http_url & parts) {
+    return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + parts.host + parts.path;
+}
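
A hedged usage sketch (mine, not part of the package) of the new `common/http.h` helpers; it assumes the header is on the include path and the URL is reachable:

```cpp
#include "common/http.h"
#include <cstdio>

int main() {
    // Build a client plus parsed URL parts; basic-auth credentials embedded
    // in the URL are applied to the client automatically.
    auto [cli, parts] = common_http_client("https://user:secret@example.com/models/model.gguf");

    // Credentials are masked when the URL is logged.
    std::printf("fetching %s\n", common_http_show_masked_url(parts).c_str());

    if (auto res = cli.Get(parts.path)) { // plain cpp-httplib request
        std::printf("HTTP status: %d\n", res->status);
    }
    return 0;
}
```
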

package/src/llama.cpp/ggml/CMakeLists.txt

@@ -4,8 +4,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 9)
-set(GGML_VERSION_PATCH
-set(GGML_VERSION_DEV "-dev") # "-dev" for development, "" for releases
+set(GGML_VERSION_PATCH 4)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
 
 find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
@@ -26,8 +25,8 @@ if(GIT_EXE)
     )
 endif()
 
-# Build the version string with optional
-set(GGML_VERSION "${GGML_VERSION_BASE}
+# Build the version string with optional dirty flag
+set(GGML_VERSION "${GGML_VERSION_BASE}")
 if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
     set(GGML_VERSION "${GGML_VERSION}-dirty")
 endif()
@@ -177,7 +176,7 @@ set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
 
 
 if (MINGW)
-    set(GGML_WIN_VER "
+    set(GGML_WIN_VER "0xA00" CACHE STRING "ggml: Windows version")
 endif()
 
 # ggml core
@@ -210,7 +209,6 @@ option(GGML_HIP "ggml: use HIP"
 option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
 option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
-option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
 option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
 option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF)
 option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
@@ -224,6 +222,9 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
 option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
 option(GGML_WEBGPU "ggml: use WebGPU" OFF)
 option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
+option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU)" OFF)
+option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF)
+
 option(GGML_ZDNN "ggml: use zDNN" OFF)
 option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
 option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)

package/src/llama.cpp/ggml/include/ggml-backend.h

@@ -215,6 +215,8 @@ extern "C" {
     // Backend registry
     //
 
+    GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
+
     GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
 
     // Backend (reg) enumeration
@@ -314,7 +316,8 @@ extern "C" {
     GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
     GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
 
-    GGML_API
+    GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
 
     GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

package/src/llama.cpp/ggml/include/ggml-rpc.h

@@ -7,26 +7,25 @@
 extern "C" {
 #endif
 
-#define RPC_PROTO_MAJOR_VERSION
+#define RPC_PROTO_MAJOR_VERSION 3
 #define RPC_PROTO_MINOR_VERSION 0
 #define RPC_PROTO_PATCH_VERSION 0
 #define GGML_RPC_MAX_SERVERS 16
 
 // backend API
-GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device);
 GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);
 
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint, uint32_t device);
 
-GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total);
 
-GGML_BACKEND_API void ggml_backend_rpc_start_server(
-
-size_t free_mem, size_t total_mem);
+GGML_BACKEND_API void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir,
+                                                    size_t n_threads, size_t n_devices,
+                                                    ggml_backend_dev_t * devices, size_t * free_mem, size_t * total_mem);
 
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
-
-GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint);
 
 #ifdef __cplusplus
 }
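
Under the reworked RPC API one endpoint can expose several devices, so every call now takes an explicit device index. A hedged sketch (mine, not package code; the endpoint address is made up):

```cpp
#include "ggml-backend.h"
#include "ggml-rpc.h"
#include <cstdio>

int main() {
    const char * endpoint = "192.168.1.10:50052"; // assumed rpc-server address

    // Query memory of device 0 on that server (new explicit device argument).
    size_t free_mem = 0, total_mem = 0;
    ggml_backend_rpc_get_device_memory(endpoint, 0, &free_mem, &total_mem);
    std::printf("device 0: %zu of %zu bytes free\n", free_mem, total_mem);

    // Bind a backend to that endpoint/device pair.
    ggml_backend_t backend = ggml_backend_rpc_init(endpoint, 0);
    // ... offload graph computation to the remote device ...
    ggml_backend_free(backend);
    return 0;
}
```
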

package/src/llama.cpp/ggml/include/ggml.h

@@ -237,6 +237,8 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1
 
+// TODO: convert to enum https://github.com/ggml-org/llama.cpp/pull/16187#discussion_r2388538726
+#define GGML_ROPE_TYPE_NORMAL 0
 #define GGML_ROPE_TYPE_NEOX 2
 #define GGML_ROPE_TYPE_MROPE 8
 #define GGML_ROPE_TYPE_VISION 24
@@ -574,6 +576,7 @@ extern "C" {
         GGML_UNARY_OP_HARDSIGMOID,
         GGML_UNARY_OP_EXP,
         GGML_UNARY_OP_GELU_ERF,
+        GGML_UNARY_OP_XIELU,
 
         GGML_UNARY_OP_COUNT,
     };
@@ -1148,6 +1151,18 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // xIELU activation function
+    // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
+    // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
+    // that constrain the positive and negative source alpha values respectively
+    GGML_API struct ggml_tensor * ggml_xielu(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            float alpha_n,
+            float alpha_p,
+            float beta,
+            float eps);
+
     // gated linear unit ops
     // A: n columns, r rows,
     // result is n / 2 columns, r rows,
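
Read literally, the formula quoted in the new header comment corresponds to this scalar reference (my sketch, not package code):

```cpp
#include <cmath>

// The two constraining functions named in the header comment.
static float softplus(float v) { return std::log1p(std::exp(v)); }
static float sigmoid(float v)  { return 1.0f / (1.0f + std::exp(-v)); }

// x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
static float xielu_ref(float x, float alpha_n, float alpha_p, float beta, float eps) {
    const float c_a = softplus(alpha_n);        // c_a = softplus
    const float c_b = softplus(alpha_p) + beta; // c_b(a, b) = softplus(a) + b
    return x * (c_a + c_b * sigmoid(beta * x)) + eps * (x > 0.0f ? 1.0f : 0.0f);
}
```
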
@@ -1615,6 +1630,13 @@ extern "C" {
             float scale,
             float max_bias);
 
+    GGML_API struct ggml_tensor * ggml_soft_max_ext_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * mask,
+            float scale,
+            float max_bias);
+
     GGML_API void ggml_soft_max_add_sinks(
             struct ggml_tensor * a,
             struct ggml_tensor * sinks);

package/src/llama.cpp/ggml/src/CMakeLists.txt

@@ -145,6 +145,9 @@ endif()
 # which was introduced in POSIX.1-2008, forcing us to go higher
 if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
     add_compile_definitions(_XOPEN_SOURCE=700)
+elseif (CMAKE_SYSTEM_NAME MATCHES "AIX")
+    # Don't define _XOPEN_SOURCE. We need _ALL_SOURCE, which is the default,
+    # in order to define _SC_PHYS_PAGES.
 else()
     add_compile_definitions(_XOPEN_SOURCE=600)
 endif()

package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt

@@ -439,6 +439,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         ggml-cpu/arch/riscv/quants.c
         ggml-cpu/arch/riscv/repack.cpp
         )
+    if (GGML_CPU_RISCV64_SPACEMIT)
+        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_RISCV64_SPACEMIT ${RISCV64_SPACEMIT_IME_SPEC})
+        list(APPEND GGML_CPU_SOURCES
+            ggml-cpu/spacemit/ime.cpp
+            ggml-cpu/spacemit/ime.h
+            ggml-cpu/spacemit/ime1_kernels.cpp
+            ggml-cpu/spacemit/ime_kernels.h
+            )
+    endif()
     set(MARCH_STR "rv64gc")
     if (GGML_RV_ZFH)
         string(APPEND MARCH_STR "_zfh")
@@ -504,9 +513,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
     # Fetch KleidiAI sources:
     include(FetchContent)
-    set(KLEIDIAI_COMMIT_TAG "v1.
+    set(KLEIDIAI_COMMIT_TAG "v1.14.0")
     set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-    set(KLEIDIAI_ARCHIVE_MD5 "
+    set(KLEIDIAI_ARCHIVE_MD5 "45e110675d93f99f82c23a1afcca76bc")
 
     if (POLICY CMP0135)
         cmake_policy(SET CMP0135 NEW)
@@ -583,6 +592,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
         ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
         ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
+        ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
         ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
         ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
         ${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)

package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp

@@ -149,6 +149,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
         if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous
             is_contiguous_2d(op->src[1]) && // src1 must be contiguous
             op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
+            op->src[0]->ne[0] % (TILE_K * 2 * 32) == 0 && // TODO: not sure if correct (https://github.com/ggml-org/llama.cpp/pull/16315)
             op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
             (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
             // src1 must be host buffer

package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c

@@ -105,6 +105,18 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 
     return ((v4f32)res)[0];
 }
+
+// multiply int8_t, add results pairwise twice
+static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
+    // Get absolute values of x vectors
+    const __m128i ax = __lsx_vsigncov_b(x, x);
+    // Sign the values of the y vectors
+    const __m128i sy = __lsx_vsigncov_b(x, y);
+    // Perform multiplication and create 16-bit values
+    const __m128i dot = lsx_maddubs_h(ax, sy);
+    const __m128i ones = __lsx_vreplgr2vr_h(1);
+    return lsx_madd_h(ones, dot);
+}
 #endif
 
 #if defined(__loongarch_asx)
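
The helper body is unchanged; the next hunk removes it from the ASX-only section, so it is effectively being moved under `__loongarch_sx`. Per 32-bit lane it computes a widening 4-element int8 dot product, roughly (my scalar sketch, not package code):

```cpp
#include <cstdint>

// Scalar equivalent of mul_sum_i8_pairs: each of the four 32-bit output
// lanes is the sum of four adjacent int8 products x[i] * y[i].
static void mul_sum_i8_pairs_ref(const int8_t x[16], const int8_t y[16], int32_t out[4]) {
    for (int lane = 0; lane < 4; ++lane) {
        int32_t acc = 0;
        for (int k = 0; k < 4; ++k) {
            acc += (int32_t) x[4 * lane + k] * (int32_t) y[4 * lane + k];
        }
        out[lane] = acc;
    }
}
```
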
@@ -323,18 +335,6 @@ static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) {
     }
 }
 
-// multiply int8_t, add results pairwise twice
-static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
-    // Get absolute values of x vectors
-    const __m128i ax = __lsx_vsigncov_b(x, x);
-    // Sign the values of the y vectors
-    const __m128i sy = __lsx_vsigncov_b(x, y);
-    // Perform multiplication and create 16-bit values
-    const __m128i dot = lsx_maddubs_h(ax, sy);
-    const __m128i ones = __lsx_vreplgr2vr_h(1);
-    return lsx_madd_h(ones, dot);
-}
-
 // horizontally add 8 floats
 static inline float hsum_float_8(const __m256 x) {
     __m128 res = lasx_extractf128(x, 1);

package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c

@@ -75,7 +75,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
         for (int j = 0; j < 8; j++) {
             const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
-
+            /* Uses non-default rounding for vec_signed or vec_round */
+            const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));
 
             y[i].qs[4*j + 0] = vec_extract(vi, 0);
             y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -122,7 +123,8 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
         for (int j = 0; j < 8; j++) {
             const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
-
+            /* Uses non-default rounding for vec_signed or vec_round */
+            const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));
 
             y[i].qs[4*j + 0] = vec_extract(vi, 0);
             y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -260,6 +262,101 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
 }
 
+void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_MXFP4 == 0);
+    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
+
+    const int qk = QK_MXFP4;
+    const int nb = n / qk;
+
+    const block_mxfp4 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    int ib = 0;
+    float sumf = 0.0f;
+
+#if defined(__VXE__) || defined(__VXE2__)
+    const int8x16_t v_k = vec_xl(0, kvalues_mxfp4);
+    const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
+
+    float32x4_t v_acc = vec_splats(0.0f);
+
+    #pragma GCC unroll 8
+    for (; ib + 1 < nb; ib += 2) {
+        const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
+        const block_mxfp4 * GGML_RESTRICT x1 = &x[ib + 1];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+        const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
+
+        const uint8x16_t v_x0 = vec_xl(0, x0->qs);
+        const uint8x16_t v_x1 = vec_xl(0, x1->qs);
+
+        int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
+        int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
+        int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
+        int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
+
+        v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
+        v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
+        v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
+        v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);
+
+        const int8x16_t v_y0l = vec_xl(0, y0->qs);
+        const int8x16_t v_y0h = vec_xl(QK8_0/2, y0->qs);
+        const int8x16_t v_y1l = vec_xl(0, y1->qs);
+        const int8x16_t v_y1h = vec_xl(QK8_0/2, y1->qs);
+
+        const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0l), v_x0h, v_y0h);
+        const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y1l), v_x1h, v_y1h);
+
+        const float32x4_t v_xy0f = vec_float(v_xy0);
+        const float32x4_t v_xy1f = vec_float(v_xy1);
+
+        const float32x4_t v_d0 = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
+        const float32x4_t v_d1 = vec_splats(GGML_E8M0_TO_FP32_HALF(x1->e) * GGML_CPU_FP16_TO_FP32(y1->d));
+
+        v_acc = vec_madd(v_xy0f, v_d0, v_acc);
+        v_acc = vec_madd(v_xy1f, v_d1, v_acc);
+    }
+
+    for (; ib < nb; ++ib) {
+        const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
+        const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
+
+        const uint8x16_t v_x = vec_xl(0, x0->qs);
+
+        int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
+        int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
+
+        v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
+        v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);
+
+        const int8x16_t v_yl = vec_xl(0, y0->qs);
+        const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
+
+        const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
+        const float32x4_t v_xyf = vec_float(v_xy);
+
+        const float32x4_t v_d = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
+        v_acc = vec_madd(v_xyf, v_d, v_acc);
+    }
+
+    sumf = vec_hsum_f32x4(v_acc);
+    *s = sumf;
+#else
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
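
For orientation, a rough scalar equivalent (my sketch, not package code) of what the VXE kernel above computes per block: the 4-bit codes are looked up in `kvalues_mxfp4`, dotted against the int8 activations, and scaled by the E8M0 block exponent times the q8_0 block scale.

```cpp
#include <cstdint>

// Scalar per-block reference; block_mxfp4, block_q8_0, kvalues_mxfp4 and the
// scale macros are the same symbols used by the kernel above.
static float mxfp4_q8_0_block_dot_ref(const block_mxfp4 * x, const block_q8_0 * y) {
    int32_t sum = 0;
    for (int j = 0; j < QK_MXFP4 / 2; ++j) {
        sum += kvalues_mxfp4[x->qs[j] & 0x0F] * y->qs[j];               // low nibbles
        sum += kvalues_mxfp4[x->qs[j] >>  4] * y->qs[j + QK_MXFP4 / 2]; // high nibbles
    }
    return sum * GGML_E8M0_TO_FP32_HALF(x->e) * GGML_CPU_FP16_TO_FP32(y->d);
}
```
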
@@ -636,7 +733,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     uint8x16_t q3h[4];
     uint8x16_t q3b[2];
     int8x16_t q3bytes[4];
-    int8x16_t q8bytes[
+    int8x16_t q8bytes[8];
     uint8x16_t qhbits[2];
 
     float sum = 0;

package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp

@@ -878,7 +878,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
     const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
     const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
 
-    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of
+    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
     const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
     const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
     const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
@@ -1231,7 +1231,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
     const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
     const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
 
-    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of
+    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
     const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
     const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
     const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);

package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h

@@ -160,7 +160,6 @@
 #define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
 #define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
 #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
-#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8

package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c

@@ -473,10 +473,10 @@ struct ggml_threadpool {
 struct ggml_compute_state {
 #ifndef GGML_USE_OPENMP
     ggml_thread_t thrd;
-    bool cpumask[GGML_MAX_N_THREADS];
     int last_graph;
     bool pending;
 #endif
+    bool cpumask[GGML_MAX_N_THREADS];
     struct ggml_threadpool * threadpool;
     int ith;
 };
@@ -2187,6 +2187,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_UNARY_OP_GELU_ERF:
         case GGML_UNARY_OP_GELU_QUICK:
         case GGML_UNARY_OP_SILU:
+        case GGML_UNARY_OP_XIELU:
             {
                 n_tasks = n_threads;
             } break;
@@ -3081,7 +3082,14 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
 
     threadpool->workers = workers;
 
-#
+#ifdef GGML_USE_OPENMP
+    int32_t cpumask_iter = 0;
+
+    // Compute CPU masks for each thread
+    for (int j = 0; j < tpp->n_threads; j++) {
+        ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
+    }
+#else // GGML_USE_OPENMP
     ggml_mutex_init(&threadpool->mutex);
     ggml_cond_init(&threadpool->cond);
 
@@ -3154,7 +3162,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
                 atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
             }
 
-
+            // Apply thread CPU mask and priority
+            int ith = omp_get_thread_num();
+
+            ggml_thread_apply_priority(threadpool->prio);
+            if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
+                ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
+            }
+            ggml_graph_compute_thread(&threadpool->workers[ith]);
         }
     } else {
         atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);

package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp

@@ -18,6 +18,10 @@
 # include "kleidiai/kleidiai.h"
 #endif
 
+#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
+# include "spacemit/ime.h"
+#endif
+
 #if defined(_WIN32)
 # define WIN32_LEAN_AND_MEAN
 # ifndef NOMINMAX
@@ -45,6 +49,12 @@ std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_type
 }
 #endif
 
+#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
+    if (ggml_backend_cpu_riscv64_spacemit_buffer_type()) {
+        bufts.push_back(ggml_backend_cpu_riscv64_spacemit_buffer_type());
+    }
+#endif
+
 #ifdef GGML_USE_CPU_KLEIDIAI
     if (ggml_backend_cpu_kleidiai_buffer_type()) {
         bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type());