@fugood/llama.node 1.4.11 → 1.4.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +31 -31
- package/src/llama.cpp/common/arg.cpp +128 -59
- package/src/llama.cpp/common/arg.h +1 -0
- package/src/llama.cpp/common/chat-parser.cpp +11 -0
- package/src/llama.cpp/common/chat.cpp +36 -7
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +42 -23
- package/src/llama.cpp/common/common.h +11 -1
- package/src/llama.cpp/common/llguidance.cpp +10 -6
- package/src/llama.cpp/common/regex-partial.cpp +13 -13
- package/src/llama.cpp/common/sampling.cpp +58 -14
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- package/src/llama.cpp/include/llama.h +100 -12
- package/src/llama.cpp/src/CMakeLists.txt +4 -0
- package/src/llama.cpp/src/llama-adapter.cpp +12 -3
- package/src/llama.cpp/src/llama-adapter.h +7 -1
- package/src/llama.cpp/src/llama-arch.cpp +78 -0
- package/src/llama.cpp/src/llama-arch.h +8 -0
- package/src/llama.cpp/src/llama-chat.cpp +11 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +637 -49
- package/src/llama.cpp/src/llama-context.h +43 -1
- package/src/llama.cpp/src/llama-grammar.cpp +40 -13
- package/src/llama.cpp/src/llama-grammar.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +173 -5
- package/src/llama.cpp/src/llama-graph.h +71 -6
- package/src/llama.cpp/src/llama-hparams.cpp +4 -0
- package/src/llama.cpp/src/llama-hparams.h +12 -5
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
- package/src/llama.cpp/src/llama-model.cpp +337 -26
- package/src/llama.cpp/src/llama-model.h +13 -2
- package/src/llama.cpp/src/llama-sampling.cpp +1259 -186
- package/src/llama.cpp/src/llama-sampling.h +19 -7
- package/src/llama.cpp/src/llama-vocab.cpp +101 -33
- package/src/llama.cpp/src/llama-vocab.h +2 -0
- package/src/llama.cpp/src/llama.cpp +87 -64
- package/src/llama.cpp/src/models/afmoe.cpp +9 -5
- package/src/llama.cpp/src/models/bert.cpp +4 -2
- package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
- package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/gemma3.cpp +3 -4
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
- package/src/llama.cpp/src/models/llama.cpp +19 -6
- package/src/llama.cpp/src/models/maincoder.cpp +117 -0
- package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- package/src/llama.cpp/src/models/models.h +18 -0
- package/src/llama.cpp/src/models/modern-bert.cpp +116 -0
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/plamo3.cpp +128 -0
- package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
- package/src/llama.cpp/src/unicode.cpp +23 -14
@@ -306,7 +306,7 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
  }
  }
  } else {
- jmsg["content"] =
+ jmsg["content"] = "";
  }
  if (!msg.reasoning_content.empty()) {
  jmsg["reasoning_content"] = msg.reasoning_content;
@@ -367,8 +367,8 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
  const auto & function = tool.at("function");
  result.push_back({
  /* .name = */ function.at("name"),
- /* .description = */ function.
- /* .parameters = */ function.
+ /* .description = */ function.value("description", ""),
+ /* .parameters = */ function.value("parameters", json::object()).dump(),
  });
  }
  }
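
Note: the hunk above makes `common_chat_tools_parse_oaicompat` tolerate tool definitions whose `function` object omits `description` or `parameters`, by using nlohmann::json's `value()` fallbacks instead of `at()`. A minimal standalone sketch of that behavior (the tool JSON below is made up for illustration, not taken from the package):

```cpp
#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    // a sparse tool definition with no "description" and no "parameters"
    const json function = { { "name", "get_time" } };

    // json::value() returns the fallback when the key is missing,
    // so parsing no longer throws on sparse tool definitions
    const std::string description = function.value("description", "");
    const std::string parameters  = function.value("parameters", json::object()).dump();

    std::cout << "description: '" << description << "'\n"; // ''
    std::cout << "parameters:  "  << parameters  << "\n";  // {}
    return 0;
}
```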
@@ -656,6 +656,7 @@ const char * common_chat_format_name(common_chat_format format) {
  case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
  case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
  case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
+ case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
  case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
  case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
  case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";
@@ -2051,7 +2052,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
  // Trigger on tool calls that appear in the commentary channel
  data.grammar_triggers.push_back({
  COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
- "<\\|channel\\|>(commentary|analysis) to"
+ "<\\|channel\\|>(?:commentary|analysis) to"
  });

  // Trigger tool calls that appear in the role section, either at the
@@ -2384,17 +2385,17 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
  (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
  // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
  data.grammar_triggers.push_back({
-
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
  // If thinking_forced_open, then we capture the </think> tag in the grammar,
  // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
- std::string(data.thinking_forced_open ? "
+ std::string(data.thinking_forced_open ? "(</think>\\s*)" : "") + (
  "\\s*("
  "(?:<tool_call>"
  "|<function"
  "|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
  "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
  ")"
- ")
+ ")"
  ),
  });
  data.preserved_tokens = {
@@ -2504,6 +2505,27 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
  return data;
  }

+ static common_chat_params common_chat_params_init_solar_open(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ // TODO: Reasoning effort
+ json additional_context = {};
+
+ data.prompt = apply(tmpl, inputs, std::nullopt, std::nullopt, additional_context);
+ data.format = COMMON_CHAT_FORMAT_SOLAR_OPEN;
+
+ data.preserved_tokens = {
+ "<|think|>",
+ "<|content|>",
+ "<|begin|>",
+ "<|end|>",
+ };
+
+ // TODO: Tool calling
+
+ return data;
+ }
+
  static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
  common_chat_params data;
  data.prompt = apply(tmpl, inputs);
@@ -2767,6 +2789,13 @@ static common_chat_params common_chat_templates_apply_jinja(
  return common_chat_params_init_magistral(tmpl, params);
  }

+ // Solar Open
+ if (src.find("<|tool_response:begin|>") != std::string::npos &&
+ src.find("<|tool_response:name|>") != std::string::npos &&
+ src.find("<|tool_response:result|>") != std::string::npos) {
+ return common_chat_params_init_solar_open(tmpl, params);
+ }
+
  // Plain handler (no tools)
  if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
  return common_chat_params_init_without_tools(tmpl, params);
@@ -135,6 +135,7 @@ enum common_chat_format {
  COMMON_CHAT_FORMAT_QWEN3_CODER_XML,
  COMMON_CHAT_FORMAT_APRIEL_1_5,
  COMMON_CHAT_FORMAT_XIAOMI_MIMO,
+ COMMON_CHAT_FORMAT_SOLAR_OPEN,

  // These are intended to be parsed by the PEG parser
  COMMON_CHAT_FORMAT_PEG_SIMPLE,
@@ -251,7 +251,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
  case GGML_SCHED_PRIO_REALTIME: p = -20; break;
  }

- if (
+ if (setpriority(PRIO_PROCESS, 0, p) != 0) {
  LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
  return false;
  }
@@ -1078,12 +1078,15 @@ struct common_init_result::impl {
  impl() = default;
  ~impl() = default;

+ // note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top
+
  llama_model_ptr model;
  llama_context_ptr context;

  std::vector<llama_adapter_lora_ptr> lora;

  std::vector<common_sampler_ptr> samplers;
+ std::vector<llama_sampler_seq_config> samplers_seq_config;
  };

  common_init_result::common_init_result(common_params & params) :
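
Note: the comment added above leans on a standard C++ guarantee: non-static data members are destroyed in reverse order of declaration, so declaring `model` before `context` ensures the context is torn down before the model it references. A small self-contained illustration (the `Tracer`/`Holder` types are hypothetical, not part of the package):

```cpp
#include <iostream>
#include <string>

struct Tracer {
    std::string name;
    explicit Tracer(std::string n) : name(std::move(n)) {}
    ~Tracer() { std::cout << "destroying " << name << "\n"; }
};

struct Holder {
    Tracer model{"model"};     // declared first, destroyed last
    Tracer context{"context"}; // declared second, destroyed first
};

int main() {
    Holder h;
    return 0; // prints "destroying context" then "destroying model"
}
```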
@@ -1107,6 +1110,25 @@ common_init_result::common_init_result(common_params & params) :

  const llama_vocab * vocab = llama_model_get_vocab(model);

+ // load and optionally apply lora adapters (must be loaded before context creation)
+ for (auto & la : params.lora_adapters) {
+ llama_adapter_lora_ptr lora;
+ lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
+ if (lora == nullptr) {
+ LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
+ pimpl->model.reset(model);
+ return;
+ }
+
+ char buf[1024];
+ la.ptr = lora.get();
+ llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+ la.task_name = buf;
+ llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+ la.prompt_prefix = buf;
+ pimpl->lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
+ }
+
  // updates params.sampling
  // TODO: fix naming
  common_init_sampler_from_model(model, params.sampling);
@@ -1141,10 +1163,19 @@ common_init_result::common_init_result(common_params & params) :
  // params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
  //}

+ // init the backend samplers as part of the context creation
  pimpl->samplers.resize(cparams.n_seq_max);
+ pimpl->samplers_seq_config.resize(cparams.n_seq_max);

  for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
  pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
+ pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
+ }
+
+ // TODO: temporarily gated behind a flag
+ if (params.sampling.backend_sampling) {
+ cparams.samplers = pimpl->samplers_seq_config.data();
+ cparams.n_samplers = pimpl->samplers_seq_config.size();
  }

  llama_context * lctx = llama_init_from_model(model, cparams);
@@ -1168,6 +1199,12 @@ common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
  return pimpl->samplers[seq_id].get();
  }

+ void common_init_result::reset_samplers() {
+ for (int i = 0; i < (int) pimpl->samplers.size(); ++i) {
+ llama_sampler_reset(common_sampler_get(pimpl->samplers[i].get()));
+ }
+ }
+
  std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
  return pimpl->lora;
  }
@@ -1243,24 +1280,6 @@ common_init_result_ptr common_init_from_params(common_params & params) {
  }
  }

- // load and optionally apply lora adapters
- for (auto & la : params.lora_adapters) {
- llama_adapter_lora_ptr lora;
- lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
- if (lora == nullptr) {
- LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
- return res;
- }
-
- char buf[1024];
- la.ptr = lora.get();
- llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
- la.task_name = buf;
- llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
- la.prompt_prefix = buf;
- res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
- }
-
  if (!params.lora_init_without_apply) {
  common_set_adapter_lora(lctx, params.lora_adapters);
  }
@@ -1301,6 +1320,9 @@ common_init_result_ptr common_init_from_params(common_params & params) {
  llama_synchronize(lctx);
  llama_perf_context_reset(lctx);
  llama_set_warmup(lctx, false);
+
+ // reset samplers to reset RNG state after warmup to the seeded state
+ res->reset_samplers();
  }

  return res;
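
Note: the reset added above exists because the warmup decode advances sampler state (including RNG draws), which would otherwise make the first real generation depend on whether warmup ran. A generic sketch of the same idea with a plain seeded RNG (not the library's sampler code):

```cpp
#include <iostream>
#include <random>

int main() {
    const unsigned seed = 42;
    std::mt19937 rng(seed);

    // "warmup" consumes a few draws and moves the RNG away from its seeded state
    for (int i = 0; i < 3; ++i) {
        (void) rng();
    }

    // resetting restores the seeded state, so real generation is reproducible
    // regardless of whether warmup happened
    rng.seed(seed);

    std::cout << rng() << "\n"; // same value as a freshly seeded mt19937(42)
    return 0;
}
```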
@@ -1339,11 +1361,8 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.devices = params.devices.data();
  }

- if (params.n_gpu_layers != -1) {
- mparams.n_gpu_layers = params.n_gpu_layers;
- }
-
  mparams.vocab_only = params.vocab_only;
+ mparams.n_gpu_layers = params.n_gpu_layers;
  mparams.main_gpu = params.main_gpu;
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
@@ -80,6 +80,7 @@ int32_t cpu_get_num_math();
  //

  enum llama_example {
+ LLAMA_EXAMPLE_DEBUG,
  LLAMA_EXAMPLE_COMMON,
  LLAMA_EXAMPLE_SPECULATIVE,
  LLAMA_EXAMPLE_COMPLETION,
@@ -216,6 +217,8 @@ struct common_params_sampling {
  std::vector<llama_logit_bias> logit_bias; // logit biases to apply
  std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

+ bool backend_sampling = false;
+
  bool has_logit_bias() const {
  return !logit_bias.empty();
  }
@@ -330,7 +333,7 @@ struct common_params {
  // offload params
  std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

- int32_t n_gpu_layers = -1; // number of layers to store in VRAM
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
  int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
  float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
  bool fit_params = true; // whether to fit unset model/context parameters to free device memory
@@ -371,6 +374,11 @@ struct common_params {
  std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
  std::string logits_file = ""; // file for saving *all* logits // NOLINT

+ // llama-debug specific options
+ std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
+ bool save_logits = false; // whether to save logits to files // NOLINT
+ std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT
+
  std::vector<std::string> in_files; // all input files
  std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
  std::vector<llama_model_kv_override> kv_overrides;
@@ -690,7 +698,9 @@ struct common_init_result {

  llama_model * model();
  llama_context * context();
+
  common_sampler * sampler(llama_seq_id seq_id);
+ void reset_samplers();

  std::vector<llama_adapter_lora_ptr> & lora();

@@ -106,12 +106,16 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
  }

  static llama_sampler_i llama_sampler_llg_i = {
- /* .name
- /* .accept
- /* .apply
- /* .reset
- /* .clone
- /* .free
+ /* .name = */ llama_sampler_llg_name,
+ /* .accept = */ llama_sampler_llg_accept_impl,
+ /* .apply = */ llama_sampler_llg_apply,
+ /* .reset = */ llama_sampler_llg_reset,
+ /* .clone = */ llama_sampler_llg_clone,
+ /* .free = */ llama_sampler_llg_free,
+ /* .backend_init = */ NULL,
+ /* .backend_accept = */ NULL,
+ /* .backend_apply = */ NULL,
+ /* .backend_set_input = */ NULL,
  };

  static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len,
@@ -27,7 +27,7 @@ common_regex_match common_regex::search(const std::string & input, size_t pos, b
  return res;
  }
  std::match_results<std::string::const_reverse_iterator> srmatch;
- if (std::
+ if (std::regex_search(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial, std::regex_constants::match_continuous)) {
  auto group = srmatch[1].str();
  if (group.length() != 0) {
  auto it = srmatch[1].second.base();
@@ -55,18 +55,18 @@ common_regex_match common_regex::search(const std::string & input, size_t pos, b
  to see if a string ends with a partial regex match, but but it's not in std::regex yet.
  Instead, we'll the regex into a partial match regex operating as a full match on the reverse iterators of the input.

- - /abcd/ -> (dcba|cba|ba|a)
- - /a|b/ -> (a|b)
+ - /abcd/ -> ^(dcba|cba|ba|a) -> ^((?:(?:(?:(?:d)?c)?b)?a)
+ - /a|b/ -> ^(a|b)
  - /a*?/ -> error, could match ""
- - /a*b/ -> ((?:b)?a*+)
- - /.*?ab/ -> ((?:b)?a)
- - /a.*?b/ -> ((?:b)?.*?a)
- - /a(bc)d/ -> ((?:(?:d)?(?:(?:c)?b))?a)
- - /a(bc|de)/ -> ((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a)
- - /ab{2,4}c/ ->
+ - /a*b/ -> ^((?:b)?a*+) (final repetitions become eager)
+ - /.*?ab/ -> ^((?:b)?a) (omit .*)
+ - /a.*?b/ -> ^((?:b)?.*?a) (keep reluctant matches)
+ - /a(bc)d/ -> ^((?:(?:d)?(?:(?:c)?b))?a)
+ - /a(bc|de)/ -> ^((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a)
+ - /ab{2,4}c/ -> ^cbbb?b?a -> ^((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a)

- The regex will match a reversed string fully, and the end of the first (And only) capturing group will indicate the reversed start of the original partial pattern
-
+ The regex will match a reversed string fully, and the end of the first (And only) capturing group will indicate the reversed start of the original partial pattern.
+ All other groups are turned into non-capturing groups, and reluctant quantifiers are ignored.
  */
  std::string regex_to_reversed_partial_regex(const std::string & pattern) {
  auto it = pattern.begin();
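
Note: the rewritten comment block above documents the transformation performed by `regex_to_reversed_partial_regex`. A minimal worked example of the technique, reversing the input explicitly rather than using reverse iterators as the library does, and hard-coding the reversed-partial form of /abcd/ (a sketch for illustration only):

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
    // does `input` end with a partial match of /abcd/ ?
    const std::string input = "hello ab"; // ends with "ab", a prefix of "abcd"
    const std::string reversed(input.rbegin(), input.rend()); // "ba olleh"

    // reversed-partial form of /abcd/: ^((?:(?:(?:d)?c)?b)?a)
    const std::regex rx_reversed_partial("^((?:(?:(?:d)?c)?b)?a)");

    std::smatch m;
    if (std::regex_search(reversed, m, rx_reversed_partial)) {
        const size_t len = m[1].length(); // number of trailing chars in the partial match
        std::cout << "partial match starts at index " << input.size() - len << "\n"; // prints 6
    }
    return 0;
}
```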
@@ -177,7 +177,7 @@ std::string regex_to_reversed_partial_regex(const std::string & pattern) {
  }
  }

- // /abcd/ -> (dcba|cba|ba|a)
+ // /abcd/ -> ^(dcba|cba|ba|a) -> ^((?:(?:(?:d)?c)?b)?a)
  // if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group
  // We'll do the outermost capturing group and final .* in the enclosing function.
  std::vector<std::string> res_alts;
@@ -200,5 +200,5 @@ std::string regex_to_reversed_partial_regex(const std::string & pattern) {
  throw std::runtime_error("Unmatched '(' in pattern");
  }

- return "(" + res + ")
+ return "^(" + res + ")";
  }
@@ -120,17 +120,34 @@ struct common_sampler {
  }

  void set_logits(struct llama_context * ctx, int idx) {
- const
+ const float * sampled_probs = llama_get_sampled_probs_ith (ctx, idx);
+ const float * sampled_logits = llama_get_sampled_logits_ith (ctx, idx);
+ const llama_token * sampled_ids = llama_get_sampled_candidates_ith(ctx, idx);

  const llama_model * model = llama_get_model(ctx);
  const llama_vocab * vocab = llama_model_get_vocab(model);

  const int n_vocab = llama_vocab_n_tokens(vocab);

-
-
-
-
+ if (sampled_probs) {
+ const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
+ cur.resize(sampled_probs_count);
+ for (uint32_t i = 0; i < sampled_probs_count; ++i) {
+ cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]};
+ }
+ } else if (sampled_logits) {
+ const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
+ cur.resize(sampled_logits_count);
+ for (uint32_t i = 0; i < sampled_logits_count; i++) {
+ cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f};
+ }
+ } else {
+ const auto * logits = llama_get_logits_ith(ctx, idx);
+ GGML_ASSERT(logits != nullptr);
+ cur.resize(n_vocab);
+ for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+ cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+ }
  }

  cur_p = { cur.data(), cur.size(), -1, false };
@@ -159,7 +176,7 @@ std::string common_params_sampling::print() const {
  return std::string(result);
  }

- struct common_sampler * common_sampler_init(const struct llama_model * model,
+ struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params) {
  const llama_vocab * vocab = llama_model_get_vocab(model);

  llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
@@ -179,24 +196,30 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
  #endif // LLAMA_USE_LLGUIDANCE
  } else {
  std::vector<std::string> trigger_patterns;
- std::vector<std::string> patterns_anywhere;
  std::vector<llama_token> trigger_tokens;
  for (const auto & trigger : params.grammar_triggers) {
  switch (trigger.type) {
  case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
  {
  const auto & word = trigger.value;
-
+ trigger_patterns.push_back(regex_escape(word));
  break;
  }
  case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
  {
-
+ trigger_patterns.push_back(trigger.value);
  break;
  }
  case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
  {
-
+ const auto & pattern = trigger.value;
+ std::string anchored = "^$";
+ if (!pattern.empty()) {
+ anchored = (pattern.front() != '^' ? "^" : "")
+ + pattern
+ + (pattern.back() != '$' ? "$" : "");
+ }
+ trigger_patterns.push_back(anchored);
  break;
  }
  case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
@@ -210,10 +233,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
  }
  }

- if (!patterns_anywhere.empty()) {
- trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
- }
-
  std::vector<const char *> trigger_patterns_c;
  trigger_patterns_c.reserve(trigger_patterns.size());
  for (const auto & regex : trigger_patterns) {
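
Note: with PATTERN_FULL triggers now anchored explicitly (and the old `patterns_anywhere` wrapper removed, as shown above), the anchoring logic from the earlier hunk can be summarized as a tiny helper. This is a sketch for illustration only; `anchor_full_pattern` is a hypothetical name, not a function in the package:

```cpp
#include <iostream>
#include <string>

// Ensure a "full" trigger pattern is anchored at both ends before it is
// handed to the grammar trigger machinery.
static std::string anchor_full_pattern(const std::string & pattern) {
    std::string anchored = "^$"; // an empty pattern may only match the empty string
    if (!pattern.empty()) {
        anchored = (pattern.front() != '^' ? "^" : "")
                 + pattern
                 + (pattern.back() != '$' ? "$" : "");
    }
    return anchored;
}

int main() {
    std::cout << anchor_full_pattern("<tool_call>.*") << "\n"; // ^<tool_call>.*$
    std::cout << anchor_full_pattern("^already$")     << "\n"; // ^already$ (unchanged)
    return 0;
}
```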
@@ -296,6 +315,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
  llama_sampler_chain_add(chain, smpl);
  }

+ if (grmr && params.backend_sampling) {
+ LOG_WRN("%s: backend sampling is not compatible with grammar, disabling\n", __func__);
+
+ params.backend_sampling = false;
+ }
+
  auto * result = new common_sampler {
  /* .params = */ params,
  /* .grmr = */ grmr,
@@ -405,6 +430,25 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
  auto & chain = gsmpl->chain;
  auto & cur_p = gsmpl->cur_p; // initialized by set_logits

+ // Check if a backend sampler has already sampled a token in which case we
+ // return that token id directly.
+ {
+ id = llama_get_sampled_token_ith(ctx, idx);
+
+ if (id != LLAMA_TOKEN_NULL) {
+ LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
+
+ GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
+
+ // TODO: simplify
+ gsmpl->cur.resize(1);
+ gsmpl->cur[0] = { id, 0.0f, 1.0f };
+ cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
+
+ return id;
+ }
+ }
+
  gsmpl->set_logits(ctx, idx);

  if (grammar_first) {
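
Note: `common_sampler_sample` now short-circuits when the backend sampler has already picked a token for the given output index, as shown above. A minimal sketch that isolates the same check, assuming the `llama_get_sampled_token_ith` API introduced in this release; `pick_backend_or_cpu_token` is a hypothetical helper, not package code, and the check is already performed inside `common_sampler_sample` itself:

```cpp
#include "llama.h"
#include "sampling.h"

// Sketch: prefer a token already chosen by the backend sampler for output
// index `idx`; otherwise fall back to the CPU sampler chain.
static llama_token pick_backend_or_cpu_token(llama_context * ctx, common_sampler * gsmpl, int idx) {
    const llama_token id = llama_get_sampled_token_ith(ctx, idx);
    if (id != LLAMA_TOKEN_NULL) {
        return id; // backend sampling already produced a token
    }
    return common_sampler_sample(gsmpl, ctx, idx); // CPU sampling path
}
```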
@@ -36,7 +36,8 @@ struct common_sampler;

  // llama_sampler API overloads

-
+ // note: can mutate params in some cases
+ struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params);

  void common_sampler_free(struct common_sampler * gsmpl);

@@ -48,6 +49,7 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
  // arguments can be nullptr to skip printing
  void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);

+ // get the underlying llama_sampler_chain
  struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);

  // extended sampling implementation:
@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
  ### GGML Version
  set(GGML_VERSION_MAJOR 0)
  set(GGML_VERSION_MINOR 9)
- set(GGML_VERSION_PATCH
+ set(GGML_VERSION_PATCH 5)
  set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

  find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
@@ -430,10 +430,22 @@ if (MSVC)
  configure_msvc_target(ggml-cpu-x64)
  configure_msvc_target(ggml-cpu-sse42)
  configure_msvc_target(ggml-cpu-sandybridge)
+ # __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+ # skipping ggml-cpu-ivybridge
+ # skipping ggml-cpu-piledriver
  configure_msvc_target(ggml-cpu-haswell)
  configure_msvc_target(ggml-cpu-skylakex)
+ configure_msvc_target(ggml-cpu-cannonlake)
+ configure_msvc_target(ggml-cpu-cascadelake)
  configure_msvc_target(ggml-cpu-icelake)
+ # MSVC 2022 doesn't support BF16 intrinsics without `/arch:AVX10.1` ?!
+ # https://learn.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-170
+ # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170
+ # skipping ggml-cpu-cooperlake
+ # skipping ggml-cpu-zen4
  configure_msvc_target(ggml-cpu-alderlake)
+ # MSVC doesn't support AMX
+ # skipping ggml-cpu-sapphirerapids

  if (GGML_BUILD_EXAMPLES)
  configure_msvc_target(common-ggml)
@@ -358,7 +358,7 @@ extern "C" {
  typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

  // Compare the output of two backends
- GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor *
+ GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor const * const * test_nodes, size_t num_test_nodes);

  // Tensor initialization
  GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
@@ -357,15 +357,29 @@ if (GGML_CPU_ALL_VARIANTS)
  endif()
  if (GGML_SYSTEM_ARCH STREQUAL "x86")
  ggml_add_cpu_backend_variant(x64)
- ggml_add_cpu_backend_variant(sse42
- ggml_add_cpu_backend_variant(sandybridge
-
-
-
-
+ ggml_add_cpu_backend_variant(sse42 SSE42)
+ ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
+ if (NOT MSVC)
+ # __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+ ggml_add_cpu_backend_variant(ivybridge SSE42 AVX F16C)
+ ggml_add_cpu_backend_variant(piledriver SSE42 AVX F16C FMA)
+ endif()
+ ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C FMA AVX2 BMI2)
+ ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C FMA AVX2 BMI2 AVX512)
+ ggml_add_cpu_backend_variant(cannonlake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI)
+ ggml_add_cpu_backend_variant(cascadelake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VNNI)
+ ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI)
+ if (NOT MSVC)
+ # MSVC 2022 doesn't support BF16 intrinsics without `/arch:AVX10.1` ?!
+ # https://learn.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-170
+ # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170
+ ggml_add_cpu_backend_variant(cooperlake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VNNI AVX512_BF16)
+ ggml_add_cpu_backend_variant(zen4 SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16)
+ endif()
+ ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C FMA AVX2 BMI2 AVX_VNNI)
  if (NOT MSVC)
  # MSVC doesn't support AMX
- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2
+ ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
  endif()
  elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
  if (CMAKE_SYSTEM_NAME MATCHES "Linux")
@@ -387,8 +401,8 @@ if (GGML_CPU_ALL_VARIANTS)
  ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
  ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
  ggml_add_cpu_backend_variant(android_armv9.0_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE2)
- ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SME)
- ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
+ ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
+ ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SVE2 SME)
  elseif (APPLE)
  ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
  ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
@@ -561,9 +561,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

  # Fetch KleidiAI sources:
  include(FetchContent)
- set(KLEIDIAI_COMMIT_TAG "v1.
+ set(KLEIDIAI_COMMIT_TAG "v1.16.0")
  set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
- set(KLEIDIAI_ARCHIVE_MD5 "
+ set(KLEIDIAI_ARCHIVE_MD5 "0a9e9008adb6031f9e8cf70dff4a3321")

  if (POLICY CMP0135)
  cmake_policy(SET CMP0135 NEW)
@@ -615,6 +615,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  string(FIND "${ARCH_FLAGS_TEMP}" "+dotprod" DOTPROD_ENABLED)
  string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
  string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
+ string(FIND "${ARCH_FLAGS_TEMP}" "+sve" SVE_ENABLED)

  set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})

@@ -659,6 +660,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
  endif()

+ if (NOT SVE_ENABLED MATCHES -1)
+ list(APPEND GGML_KLEIDIAI_SOURCES
+ ${KLEIDIAI_SRC}/kai/kai_common_sve_asm.S
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod_asm.S
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod.c
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm_asm.S
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm.c)
+ endif()
+
  set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
  list(APPEND GGML_CPU_SOURCES ${GGML_KLEIDIAI_SOURCES})
  endif()
@@ -328,7 +328,7 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)

  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <intrin.h>
- #elif defined(
+ #elif defined(__SSE__) || defined(__SSE3__) || defined(__SSSE3__) || defined(__AVX__) || defined(__F16C__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX512BF16__)
  #include <immintrin.h>
  #endif
