@fugood/llama.node 1.2.0-rc.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/common/chat.cpp +139 -0
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +21 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +12 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-metal.h +0 -6
- package/src/llama.cpp/ggml/include/ggml.h +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +7 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +0 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +161 -0
- package/src/llama.cpp/src/llama-arch.cpp +1 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +5 -3
- package/src/llama.cpp/src/llama-graph.cpp +3 -2
- package/src/llama.cpp/src/llama-hparams.h +1 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +55 -15
- package/src/llama.cpp/src/llama-kv-cache.h +8 -0
- package/src/llama.cpp/src/llama-model.cpp +58 -96
- package/src/llama.cpp/src/llama-quant.cpp +3 -3
- package/src/llama.cpp/src/llama.cpp +53 -10
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@fugood/llama.node",
|
|
3
3
|
"access": "public",
|
|
4
|
-
"version": "1.2.0-rc.0",
|
|
4
|
+
"version": "1.2.0",
|
|
5
5
|
"description": "An another Node binding of llama.cpp",
|
|
6
6
|
"main": "lib/index.js",
|
|
7
7
|
"scripts": {
|
|
@@ -72,19 +72,19 @@
|
|
|
72
72
|
"CMakeLists.txt"
|
|
73
73
|
],
|
|
74
74
|
"optionalDependencies": {
|
|
75
|
-
"@fugood/node-llama-linux-x64": "1.2.0-rc.0",
|
|
76
|
-
"@fugood/node-llama-linux-x64-vulkan": "1.2.0-rc.0",
|
|
77
|
-
"@fugood/node-llama-linux-x64-cuda": "1.2.0-rc.0",
|
|
78
|
-
"@fugood/node-llama-linux-arm64": "1.2.0-rc.0",
|
|
79
|
-
"@fugood/node-llama-linux-arm64-vulkan": "1.2.0-rc.0",
|
|
80
|
-
"@fugood/node-llama-linux-arm64-cuda": "1.2.0-rc.0",
|
|
81
|
-
"@fugood/node-llama-win32-x64": "1.2.0-rc.0",
|
|
82
|
-
"@fugood/node-llama-win32-x64-vulkan": "1.2.0-rc.0",
|
|
83
|
-
"@fugood/node-llama-win32-x64-cuda": "1.2.0-rc.0",
|
|
84
|
-
"@fugood/node-llama-win32-arm64": "1.2.0-rc.0",
|
|
85
|
-
"@fugood/node-llama-win32-arm64-vulkan": "1.2.0-rc.0",
|
|
86
|
-
"@fugood/node-llama-darwin-x64": "1.2.0-rc.0",
|
|
87
|
-
"@fugood/node-llama-darwin-arm64": "1.2.0-rc.0"
|
|
75
|
+
"@fugood/node-llama-linux-x64": "1.2.0",
|
|
76
|
+
"@fugood/node-llama-linux-x64-vulkan": "1.2.0",
|
|
77
|
+
"@fugood/node-llama-linux-x64-cuda": "1.2.0",
|
|
78
|
+
"@fugood/node-llama-linux-arm64": "1.2.0",
|
|
79
|
+
"@fugood/node-llama-linux-arm64-vulkan": "1.2.0",
|
|
80
|
+
"@fugood/node-llama-linux-arm64-cuda": "1.2.0",
|
|
81
|
+
"@fugood/node-llama-win32-x64": "1.2.0",
|
|
82
|
+
"@fugood/node-llama-win32-x64-vulkan": "1.2.0",
|
|
83
|
+
"@fugood/node-llama-win32-x64-cuda": "1.2.0",
|
|
84
|
+
"@fugood/node-llama-win32-arm64": "1.2.0",
|
|
85
|
+
"@fugood/node-llama-win32-arm64-vulkan": "1.2.0",
|
|
86
|
+
"@fugood/node-llama-darwin-x64": "1.2.0",
|
|
87
|
+
"@fugood/node-llama-darwin-arm64": "1.2.0"
|
|
88
88
|
},
|
|
89
89
|
"devDependencies": {
|
|
90
90
|
"@babel/preset-env": "^7.24.4",
|
|
@@ -618,6 +618,7 @@ const char * common_chat_format_name(common_chat_format format) {
|
|
|
618
618
|
case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
|
|
619
619
|
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
|
|
620
620
|
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
|
|
621
|
+
case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: return "DeepSeek V3.1";
|
|
621
622
|
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
|
|
622
623
|
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
|
|
623
624
|
case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
|
|
@@ -685,11 +686,13 @@ static void parse_json_tool_calls(
|
|
|
685
686
|
size_t from = std::string::npos;
|
|
686
687
|
auto first = true;
|
|
687
688
|
while (true) {
|
|
689
|
+
auto start_pos = builder.pos();
|
|
688
690
|
auto res = function_regex_start_only && first
|
|
689
691
|
? builder.try_consume_regex(*function_regex_start_only)
|
|
690
692
|
: function_regex
|
|
691
693
|
? builder.try_find_regex(*function_regex, from)
|
|
692
694
|
: std::nullopt;
|
|
695
|
+
|
|
693
696
|
if (res) {
|
|
694
697
|
std::string name;
|
|
695
698
|
if (get_function_name) {
|
|
@@ -724,6 +727,8 @@ static void parse_json_tool_calls(
|
|
|
724
727
|
return;
|
|
725
728
|
}
|
|
726
729
|
throw common_chat_msg_partial_exception("incomplete tool call");
|
|
730
|
+
} else {
|
|
731
|
+
builder.move_to(start_pos);
|
|
727
732
|
}
|
|
728
733
|
break;
|
|
729
734
|
}
|
|
@@ -1374,6 +1379,71 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
|
|
|
1374
1379
|
}
|
|
1375
1380
|
return data;
|
|
1376
1381
|
}
|
|
1382
|
+
|
|
1383
|
+
static common_chat_params common_chat_params_init_deepseek_v3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
|
1384
|
+
common_chat_params data;
|
|
1385
|
+
|
|
1386
|
+
// Pass thinking context for DeepSeek V3.1 template
|
|
1387
|
+
json additional_context = {
|
|
1388
|
+
{"thinking", inputs.enable_thinking},
|
|
1389
|
+
};
|
|
1390
|
+
|
|
1391
|
+
auto prompt = apply(tmpl, inputs,
|
|
1392
|
+
/* messages_override= */ inputs.messages,
|
|
1393
|
+
/* tools_override= */ std::nullopt,
|
|
1394
|
+
additional_context);
|
|
1395
|
+
data.prompt = prompt;
|
|
1396
|
+
data.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
|
|
1397
|
+
if (string_ends_with(data.prompt, "<think>")) {
|
|
1398
|
+
if (!inputs.enable_thinking) {
|
|
1399
|
+
data.prompt += "</think>";
|
|
1400
|
+
} else {
|
|
1401
|
+
data.thinking_forced_open = true;
|
|
1402
|
+
}
|
|
1403
|
+
}
|
|
1404
|
+
if (inputs.tools.is_array() && !inputs.tools.empty()) {
|
|
1405
|
+
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
|
|
1406
|
+
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
|
1407
|
+
std::vector<std::string> tool_rules;
|
|
1408
|
+
foreach_function(inputs.tools, [&](const json & tool) {
|
|
1409
|
+
const auto & function = tool.at("function");
|
|
1410
|
+
std::string name = function.at("name");
|
|
1411
|
+
auto parameters = function.at("parameters");
|
|
1412
|
+
builder.resolve_refs(parameters);
|
|
1413
|
+
tool_rules.push_back(builder.add_rule(name + "-call",
|
|
1414
|
+
"( \"<|tool▁call▁begin|>\" )? \"" + name + "<|tool▁sep|>"
|
|
1415
|
+
"\" " + builder.add_schema(name + "-args", parameters) + " "
|
|
1416
|
+
"\"<|tool▁call▁end|>\""));
|
|
1417
|
+
});
|
|
1418
|
+
// Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
|
|
1419
|
+
// so we accept common variants (then it's all constrained)
|
|
1420
|
+
builder.add_rule("root",
|
|
1421
|
+
std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
|
|
1422
|
+
"( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" | \"<|tool▁calls|>\" ) "
|
|
1423
|
+
"(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
|
|
1424
|
+
"\"<|tool▁calls▁end|>\""
|
|
1425
|
+
" space");
|
|
1426
|
+
data.grammar_triggers.push_back({
|
|
1427
|
+
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
|
|
1428
|
+
// If thinking_forced_open, then we capture the </think> tag in the grammar,
|
|
1429
|
+
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
|
|
1430
|
+
std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") +
|
|
1431
|
+
"(<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)[\\s\\S]*"
|
|
1432
|
+
});
|
|
1433
|
+
data.preserved_tokens = {
|
|
1434
|
+
"<think>",
|
|
1435
|
+
"</think>",
|
|
1436
|
+
"<|tool▁calls▁begin|>",
|
|
1437
|
+
"<|tool▁call▁begin|>",
|
|
1438
|
+
"<|tool▁sep|>",
|
|
1439
|
+
"<|tool▁call▁end|>",
|
|
1440
|
+
"<|tool▁calls▁end|>",
|
|
1441
|
+
};
|
|
1442
|
+
});
|
|
1443
|
+
}
|
|
1444
|
+
return data;
|
|
1445
|
+
}
|
|
1446
|
+
|
|
1377
1447
|
static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
|
|
1378
1448
|
builder.try_parse_reasoning("<think>", "</think>");
|
|
1379
1449
|
if (!builder.syntax().parse_tool_calls) {
|
|
@@ -1395,6 +1465,66 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
|
|
|
1395
1465
|
tool_calls_end);
|
|
1396
1466
|
}
|
|
1397
1467
|
|
|
1468
|
+
static void common_chat_parse_deepseek_v3_1_content(common_chat_msg_parser & builder) {
|
|
1469
|
+
static const common_regex function_regex("(?:<|tool▁call▁begin|>)?([^\\n<]+)(?:<|tool▁sep|>)");
|
|
1470
|
+
|
|
1471
|
+
static const common_regex close_regex("(?:[\\s]*)?<|tool▁call▁end|>");
|
|
1472
|
+
static const common_regex tool_calls_begin("(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)");
|
|
1473
|
+
static const common_regex tool_calls_end("<|tool▁calls▁end|>");
|
|
1474
|
+
|
|
1475
|
+
if (!builder.syntax().parse_tool_calls) {
|
|
1476
|
+
LOG_DBG("%s: not parse_tool_calls\n", __func__);
|
|
1477
|
+
builder.add_content(builder.consume_rest());
|
|
1478
|
+
return;
|
|
1479
|
+
}
|
|
1480
|
+
|
|
1481
|
+
LOG_DBG("%s: parse_tool_calls\n", __func__);
|
|
1482
|
+
|
|
1483
|
+
parse_json_tool_calls(
|
|
1484
|
+
builder,
|
|
1485
|
+
/* block_open= */ tool_calls_begin,
|
|
1486
|
+
/* function_regex_start_only= */ std::nullopt,
|
|
1487
|
+
function_regex,
|
|
1488
|
+
close_regex,
|
|
1489
|
+
tool_calls_end);
|
|
1490
|
+
}
|
|
1491
|
+
|
|
1492
|
+
static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
|
|
1493
|
+
// DeepSeek V3.1 outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
|
|
1494
|
+
// First try to parse using the standard reasoning parsing method
|
|
1495
|
+
LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
|
|
1496
|
+
|
|
1497
|
+
auto start_pos = builder.pos();
|
|
1498
|
+
auto found_end_think = builder.try_find_literal("</think>");
|
|
1499
|
+
builder.move_to(start_pos);
|
|
1500
|
+
|
|
1501
|
+
if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
|
|
1502
|
+
LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
|
|
1503
|
+
common_chat_parse_deepseek_v3_1_content(builder);
|
|
1504
|
+
} else if (builder.try_parse_reasoning("<think>", "</think>")) {
|
|
1505
|
+
// If reasoning was parsed successfully, the remaining content is regular content
|
|
1506
|
+
LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
|
|
1507
|
+
// </think><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\n```json\nJSON\n```<|tool▁call▁end|><|tool▁calls▁end|>
|
|
1508
|
+
common_chat_parse_deepseek_v3_1_content(builder);
|
|
1509
|
+
} else {
|
|
1510
|
+
if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
|
|
1511
|
+
LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
|
|
1512
|
+
common_chat_parse_deepseek_v3_1_content(builder);
|
|
1513
|
+
return;
|
|
1514
|
+
}
|
|
1515
|
+
// If no reasoning tags found, check if we should treat everything as reasoning
|
|
1516
|
+
if (builder.syntax().thinking_forced_open) {
|
|
1517
|
+
// If thinking is forced open but no tags found, treat everything as reasoning
|
|
1518
|
+
LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
|
|
1519
|
+
builder.add_reasoning_content(builder.consume_rest());
|
|
1520
|
+
} else {
|
|
1521
|
+
LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
|
|
1522
|
+
// <|tool▁call▁begin|>NAME<|tool▁sep|>JSON<|tool▁call▁end|>
|
|
1523
|
+
common_chat_parse_deepseek_v3_1_content(builder);
|
|
1524
|
+
}
|
|
1525
|
+
}
|
|
1526
|
+
}
|
|
1527
|
+
|
|
1398
1528
|
static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
|
1399
1529
|
common_chat_params data;
|
|
1400
1530
|
auto prompt = apply(tmpl, inputs);
|
|
@@ -2351,6 +2481,12 @@ static common_chat_params common_chat_templates_apply_jinja(
|
|
|
2351
2481
|
}
|
|
2352
2482
|
}
|
|
2353
2483
|
|
|
2484
|
+
// DeepSeek V3.1: detect based on specific patterns in the template
|
|
2485
|
+
if (src.find("message['prefix'] is defined and message['prefix'] and thinking") != std::string::npos &&
|
|
2486
|
+
params.json_schema.is_null()) {
|
|
2487
|
+
return common_chat_params_init_deepseek_v3_1(tmpl, params);
|
|
2488
|
+
}
|
|
2489
|
+
|
|
2354
2490
|
// DeepSeek R1: use handler in all cases except json schema (thinking / tools).
|
|
2355
2491
|
if (src.find("<|tool▁calls▁begin|>") != std::string::npos && params.json_schema.is_null()) {
|
|
2356
2492
|
return common_chat_params_init_deepseek_r1(tmpl, params);
|
|
@@ -2523,6 +2659,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
|
|
|
2523
2659
|
case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
|
|
2524
2660
|
common_chat_parse_deepseek_r1(builder);
|
|
2525
2661
|
break;
|
|
2662
|
+
case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1:
|
|
2663
|
+
common_chat_parse_deepseek_v3_1(builder);
|
|
2664
|
+
break;
|
|
2526
2665
|
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
|
|
2527
2666
|
common_chat_parse_functionary_v3_2(builder);
|
|
2528
2667
|
break;
|
|
@@ -118,6 +118,7 @@ enum common_chat_format {
|
|
|
118
118
|
COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
|
|
119
119
|
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
|
|
120
120
|
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
|
|
121
|
+
COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
|
|
121
122
|
COMMON_CHAT_FORMAT_HERMES_2_PRO,
|
|
122
123
|
COMMON_CHAT_FORMAT_COMMAND_R7B,
|
|
123
124
|
COMMON_CHAT_FORMAT_GRANITE,
|
|
@@ -843,9 +843,10 @@ public:
|
|
|
843
843
|
_build_object_rule(
|
|
844
844
|
properties, required, name,
|
|
845
845
|
schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
|
|
846
|
-
} else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
|
|
846
|
+
} else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
|
|
847
847
|
std::unordered_set<std::string> required;
|
|
848
848
|
std::vector<std::pair<std::string, json>> properties;
|
|
849
|
+
std::map<std::string, size_t> enum_values;
|
|
849
850
|
std::string hybrid_name = name;
|
|
850
851
|
std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
|
|
851
852
|
if (comp_schema.contains("$ref")) {
|
|
@@ -857,6 +858,14 @@ public:
|
|
|
857
858
|
required.insert(prop.key());
|
|
858
859
|
}
|
|
859
860
|
}
|
|
861
|
+
} else if (comp_schema.contains("enum")) {
|
|
862
|
+
for (const auto & v : comp_schema["enum"]) {
|
|
863
|
+
const auto rule = _generate_constant_rule(v);
|
|
864
|
+
if (enum_values.find(rule) == enum_values.end()) {
|
|
865
|
+
enum_values[rule] = 0;
|
|
866
|
+
}
|
|
867
|
+
enum_values[rule] += 1;
|
|
868
|
+
}
|
|
860
869
|
} else {
|
|
861
870
|
// todo warning
|
|
862
871
|
}
|
|
@@ -870,6 +879,17 @@ public:
|
|
|
870
879
|
add_component(t, true);
|
|
871
880
|
}
|
|
872
881
|
}
|
|
882
|
+
if (!enum_values.empty()) {
|
|
883
|
+
std::vector<std::string> enum_intersection;
|
|
884
|
+
for (const auto & p : enum_values) {
|
|
885
|
+
if (p.second == schema["allOf"].size()) {
|
|
886
|
+
enum_intersection.push_back(p.first);
|
|
887
|
+
}
|
|
888
|
+
}
|
|
889
|
+
if (!enum_intersection.empty()) {
|
|
890
|
+
return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
|
|
891
|
+
}
|
|
892
|
+
}
|
|
873
893
|
return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
|
|
874
894
|
} else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
|
|
875
895
|
json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
|
|
@@ -132,6 +132,8 @@ extern "C" {
|
|
|
132
132
|
GGML_BACKEND_DEVICE_TYPE_CPU,
|
|
133
133
|
// GPU device using dedicated memory
|
|
134
134
|
GGML_BACKEND_DEVICE_TYPE_GPU,
|
|
135
|
+
// integrated GPU device using host memory
|
|
136
|
+
GGML_BACKEND_DEVICE_TYPE_IGPU,
|
|
135
137
|
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
|
|
136
138
|
GGML_BACKEND_DEVICE_TYPE_ACCEL
|
|
137
139
|
};
|
|
@@ -150,11 +152,21 @@ extern "C" {
|
|
|
150
152
|
|
|
151
153
|
// all the device properties
|
|
152
154
|
struct ggml_backend_dev_props {
|
|
155
|
+
// device name
|
|
153
156
|
const char * name;
|
|
157
|
+
// device description
|
|
154
158
|
const char * description;
|
|
159
|
+
// device free memory in bytes
|
|
155
160
|
size_t memory_free;
|
|
161
|
+
// device total memory in bytes
|
|
156
162
|
size_t memory_total;
|
|
163
|
+
// device type
|
|
157
164
|
enum ggml_backend_dev_type type;
|
|
165
|
+
// device id
|
|
166
|
+
// for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
|
|
167
|
+
// if the id is unknown, this should be NULL
|
|
168
|
+
const char * device_id;
|
|
169
|
+
// device capabilities
|
|
158
170
|
struct ggml_backend_dev_caps caps;
|
|
159
171
|
};
|
|
160
172
|
|
|
@@ -134,6 +134,7 @@ extern "C" {
|
|
|
134
134
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
|
135
135
|
|
|
136
136
|
GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
|
|
137
|
+
GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
|
|
137
138
|
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
|
|
138
139
|
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
|
|
139
140
|
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
|
|
@@ -43,14 +43,8 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
|
|
|
43
43
|
|
|
44
44
|
GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
|
45
45
|
|
|
46
|
-
GGML_DEPRECATED(
|
|
47
|
-
GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
|
|
48
|
-
"obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
|
|
49
|
-
|
|
50
46
|
GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
|
|
51
47
|
|
|
52
|
-
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
|
53
|
-
|
|
54
48
|
// helper to check if the device supports a specific family
|
|
55
49
|
// ideally, the user code should be doing these checks
|
|
56
50
|
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
|
|
@@ -1404,6 +1404,7 @@ extern "C" {
|
|
|
1404
1404
|
struct ggml_tensor * a,
|
|
1405
1405
|
struct ggml_tensor * b);
|
|
1406
1406
|
|
|
1407
|
+
// note: casting from f32 to i32 will discard the fractional part
|
|
1407
1408
|
GGML_API struct ggml_tensor * ggml_cast(
|
|
1408
1409
|
struct ggml_context * ctx,
|
|
1409
1410
|
struct ggml_tensor * a,
|
|
@@ -1528,7 +1529,11 @@ extern "C" {
|
|
|
1528
1529
|
struct ggml_context * ctx,
|
|
1529
1530
|
struct ggml_tensor * a);
|
|
1530
1531
|
|
|
1531
|
-
// supports
|
|
1532
|
+
// supports 4D a:
|
|
1533
|
+
// a [n_embd, ne1, ne2, ne3]
|
|
1534
|
+
// b I32 [n_rows, ne2, ne3, 1]
|
|
1535
|
+
//
|
|
1536
|
+
// return [n_embd, n_rows, ne2, ne3]
|
|
1532
1537
|
GGML_API struct ggml_tensor * ggml_get_rows(
|
|
1533
1538
|
struct ggml_context * ctx,
|
|
1534
1539
|
struct ggml_tensor * a, // data
|
|
@@ -224,7 +224,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|
|
224
224
|
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
|
|
225
225
|
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
|
|
226
226
|
if (NOT ${feature_pos} EQUAL -1)
|
|
227
|
-
|
|
227
|
+
# Special handling for MATMUL_INT8 when machine doesn't support i8mm
|
|
228
|
+
if ("${feature}" STREQUAL "MATMUL_INT8" AND GGML_MACHINE_SUPPORTS_noi8mm)
|
|
229
|
+
message(STATUS "ARM feature ${feature} detected but unsetting due to machine not supporting i8mm")
|
|
230
|
+
list(APPEND ARCH_FLAGS -U__ARM_FEATURE_MATMUL_INT8)
|
|
231
|
+
else()
|
|
232
|
+
message(STATUS "ARM feature ${feature} enabled")
|
|
233
|
+
endif()
|
|
228
234
|
endif()
|
|
229
235
|
endforeach()
|
|
230
236
|
endif()
|
|
@@ -53,9 +53,9 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
53
53
|
|
|
54
54
|
#if defined(__VXE__) || defined(__VXE2__)
|
|
55
55
|
for (int i = 0; i < nb; i++) {
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
56
|
+
float32x4_t srcv [8];
|
|
57
|
+
float32x4_t asrcv[8];
|
|
58
|
+
float32x4_t amaxv[8];
|
|
59
59
|
|
|
60
60
|
for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
|
|
61
61
|
for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
|
|
@@ -74,8 +74,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
74
74
|
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
|
75
75
|
|
|
76
76
|
for (int j = 0; j < 8; j++) {
|
|
77
|
-
const
|
|
78
|
-
const
|
|
77
|
+
const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
|
|
78
|
+
const int32x4_t vi = vec_signed(v);
|
|
79
79
|
|
|
80
80
|
y[i].qs[4*j + 0] = vec_extract(vi, 0);
|
|
81
81
|
y[i].qs[4*j + 1] = vec_extract(vi, 1);
|
|
@@ -98,9 +98,9 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
98
98
|
|
|
99
99
|
#if defined(__VXE__) || defined(__VXE2__)
|
|
100
100
|
for (int i = 0; i < nb; i++) {
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
101
|
+
float32x4_t srcv [8];
|
|
102
|
+
float32x4_t asrcv[8];
|
|
103
|
+
float32x4_t amaxv[8];
|
|
104
104
|
|
|
105
105
|
for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
|
|
106
106
|
for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
|
|
@@ -118,11 +118,11 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
|
|
118
118
|
|
|
119
119
|
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
|
120
120
|
|
|
121
|
-
|
|
121
|
+
int32x4_t acc = vec_splats(0);
|
|
122
122
|
|
|
123
123
|
for (int j = 0; j < 8; j++) {
|
|
124
|
-
const
|
|
125
|
-
const
|
|
124
|
+
const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
|
|
125
|
+
const int32x4_t vi = vec_signed(v);
|
|
126
126
|
|
|
127
127
|
y[i].qs[4*j + 0] = vec_extract(vi, 0);
|
|
128
128
|
y[i].qs[4*j + 1] = vec_extract(vi, 1);
|
|
@@ -162,37 +162,36 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
162
162
|
float sumf = 0;
|
|
163
163
|
|
|
164
164
|
#if defined(__VXE__) || defined(__VXE2__)
|
|
165
|
-
|
|
165
|
+
float32x4_t acc = vec_splats(0.0f);
|
|
166
166
|
|
|
167
|
-
const
|
|
168
|
-
const
|
|
167
|
+
const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
|
|
168
|
+
const int8x16_t v_s = vec_splats( (const int8_t)0x08);
|
|
169
169
|
|
|
170
170
|
for (; ib < nb; ++ib) {
|
|
171
|
-
const
|
|
172
|
-
const
|
|
173
|
-
const
|
|
171
|
+
const uint8x16_t v_x = vec_xl(0, x[ib].qs);
|
|
172
|
+
const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
|
|
173
|
+
const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);
|
|
174
174
|
|
|
175
|
-
const
|
|
176
|
-
const
|
|
175
|
+
const int8x16_t v_xls = vec_sub(v_xl, v_s);
|
|
176
|
+
const int8x16_t v_xhs = vec_sub(v_xh, v_s);
|
|
177
177
|
|
|
178
|
-
const
|
|
179
|
-
const
|
|
178
|
+
const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
|
|
179
|
+
const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
|
|
180
180
|
|
|
181
|
-
const
|
|
182
|
-
const
|
|
183
|
-
const
|
|
184
|
-
const
|
|
181
|
+
const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
|
|
182
|
+
const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
|
|
183
|
+
const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
|
|
184
|
+
const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);
|
|
185
185
|
|
|
186
|
-
|
|
186
|
+
int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
|
|
187
187
|
|
|
188
|
-
const
|
|
189
|
-
const
|
|
188
|
+
const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
|
|
189
|
+
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
190
190
|
|
|
191
191
|
acc = vec_madd(v_xy, v_d, acc);
|
|
192
192
|
}
|
|
193
193
|
|
|
194
|
-
sumf = acc
|
|
195
|
-
|
|
194
|
+
sumf = vec_hsum_f32x4(acc);
|
|
196
195
|
*s = sumf;
|
|
197
196
|
#else
|
|
198
197
|
UNUSED(nb);
|
|
@@ -249,8 +248,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
249
248
|
acc = vec_madd(v_xy, v_d, acc);
|
|
250
249
|
}
|
|
251
250
|
|
|
252
|
-
sumf = acc
|
|
253
|
-
|
|
251
|
+
sumf = vec_hsum_f32x4(acc) + summs;
|
|
254
252
|
*s = sumf;
|
|
255
253
|
#else
|
|
256
254
|
UNUSED(nb);
|
|
@@ -351,7 +349,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
351
349
|
v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
|
|
352
350
|
}
|
|
353
351
|
|
|
354
|
-
sumf +=
|
|
352
|
+
sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1);
|
|
355
353
|
|
|
356
354
|
#pragma GCC unroll 4
|
|
357
355
|
for (; ib < nb; ++ib) {
|
|
@@ -390,7 +388,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
390
388
|
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
|
|
391
389
|
const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
|
|
392
390
|
|
|
393
|
-
sumf +=
|
|
391
|
+
sumf += vec_hsum_f32x4(v_acc);
|
|
394
392
|
}
|
|
395
393
|
|
|
396
394
|
*s = sumf;
|
|
@@ -502,7 +500,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
502
500
|
v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
|
|
503
501
|
}
|
|
504
502
|
|
|
505
|
-
sumf +=
|
|
503
|
+
sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1) + summs0 + summs1;
|
|
506
504
|
|
|
507
505
|
#pragma GCC unroll 4
|
|
508
506
|
for (; ib < nb; ++ib) {
|
|
@@ -543,7 +541,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
543
541
|
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
|
|
544
542
|
const float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc);
|
|
545
543
|
|
|
546
|
-
sumf +=
|
|
544
|
+
sumf += vec_hsum_f32x4(v_acc) + summs;
|
|
547
545
|
}
|
|
548
546
|
|
|
549
547
|
*s = sumf;
|
|
@@ -575,7 +573,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
575
573
|
float sumf = 0;
|
|
576
574
|
|
|
577
575
|
#if defined(__VXE__) || defined(__VXE2__)
|
|
578
|
-
|
|
576
|
+
float32x4_t acc = vec_splats(0.0f);
|
|
579
577
|
|
|
580
578
|
#pragma GCC unroll 8
|
|
581
579
|
for (; ib < nb; ++ib) {
|
|
@@ -594,7 +592,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
594
592
|
acc = vec_madd(v_xy, v_d, acc);
|
|
595
593
|
}
|
|
596
594
|
|
|
597
|
-
sumf = acc
|
|
595
|
+
sumf = vec_hsum_f32x4(acc);
|
|
598
596
|
|
|
599
597
|
*s = sumf;
|
|
600
598
|
#else
|
|
@@ -718,10 +716,10 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
718
716
|
isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
|
|
719
717
|
isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
|
|
720
718
|
|
|
721
|
-
isum += (isum0
|
|
722
|
-
isum += (isum1
|
|
723
|
-
isum += (isum2
|
|
724
|
-
isum += (isum3
|
|
719
|
+
isum += vec_hsum_i32x4(isum0) * scale[0];
|
|
720
|
+
isum += vec_hsum_i32x4(isum1) * scale[1];
|
|
721
|
+
isum += vec_hsum_i32x4(isum2) * scale[2];
|
|
722
|
+
isum += vec_hsum_i32x4(isum3) * scale[3];
|
|
725
723
|
|
|
726
724
|
scale += 4;
|
|
727
725
|
|
|
@@ -819,7 +817,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
819
817
|
v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
|
|
820
818
|
|
|
821
819
|
const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
|
|
822
|
-
sumi1 += (p1
|
|
820
|
+
sumi1 += vec_hsum_i32x4(p1) * scales[2*j+0];
|
|
823
821
|
|
|
824
822
|
v_y[0] = vec_xl(0 , y0);
|
|
825
823
|
v_y[1] = vec_xl(16, y0);
|
|
@@ -829,7 +827,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
829
827
|
v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
|
|
830
828
|
|
|
831
829
|
const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
|
|
832
|
-
sumi2 += (p2
|
|
830
|
+
sumi2 += vec_hsum_i32x4(p2) * scales[2*j+1];
|
|
833
831
|
}
|
|
834
832
|
|
|
835
833
|
sumf += d * (sumi1 + sumi2);
|
|
@@ -911,7 +909,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
911
909
|
const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
|
|
912
910
|
const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
|
|
913
911
|
const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
|
|
914
|
-
const int32_t mins = v_mins
|
|
912
|
+
const int32_t mins = vec_hsum_i32x4(v_mins);
|
|
915
913
|
|
|
916
914
|
const uint8_t * scales = (const uint8_t *)utmp;
|
|
917
915
|
const uint8_t * GGML_RESTRICT x0l = x[i].qs;
|
|
@@ -948,8 +946,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
948
946
|
int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
|
|
949
947
|
int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
|
|
950
948
|
|
|
951
|
-
sumi += (sumi0
|
|
952
|
-
sumi += (sumi1
|
|
949
|
+
sumi += vec_hsum_i32x4(sumi0) * *scales++;
|
|
950
|
+
sumi += vec_hsum_i32x4(sumi1) * *scales++;
|
|
953
951
|
}
|
|
954
952
|
|
|
955
953
|
sumf += d * sumi - dmin * mins;
|
|
@@ -1020,7 +1018,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1020
1018
|
const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
|
|
1021
1019
|
const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
|
|
1022
1020
|
|
|
1023
|
-
const int32_t mins = v_mins
|
|
1021
|
+
const int32_t mins = vec_hsum_i32x4(v_mins);
|
|
1024
1022
|
|
|
1025
1023
|
int32_t isum = 0;
|
|
1026
1024
|
for (int j = 0; j < QK_K/128; ++j) {
|
|
@@ -1060,10 +1058,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1060
1058
|
int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
|
|
1061
1059
|
int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
|
|
1062
1060
|
|
|
1063
|
-
isum += (summs0
|
|
1064
|
-
(summs1
|
|
1065
|
-
(summs2
|
|
1066
|
-
(summs3
|
|
1061
|
+
isum += vec_hsum_i32x4(summs0) * scale[0] +
|
|
1062
|
+
vec_hsum_i32x4(summs1) * scale[1] +
|
|
1063
|
+
vec_hsum_i32x4(summs2) * scale[2] +
|
|
1064
|
+
vec_hsum_i32x4(summs3) * scale[3];
|
|
1067
1065
|
|
|
1068
1066
|
scale += 4;
|
|
1069
1067
|
|
|
@@ -1094,10 +1092,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1094
1092
|
summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
|
|
1095
1093
|
summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
|
|
1096
1094
|
|
|
1097
|
-
isum += (summs0
|
|
1098
|
-
(summs1
|
|
1099
|
-
(summs2
|
|
1100
|
-
(summs3
|
|
1095
|
+
isum += vec_hsum_i32x4(summs0) * scale[0] +
|
|
1096
|
+
vec_hsum_i32x4(summs1) * scale[1] +
|
|
1097
|
+
vec_hsum_i32x4(summs2) * scale[2] +
|
|
1098
|
+
vec_hsum_i32x4(summs3) * scale[3];
|
|
1101
1099
|
|
|
1102
1100
|
scale += 4;
|
|
1103
1101
|
}
|
|
@@ -1285,7 +1283,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
1285
1283
|
const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
|
|
1286
1284
|
const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
|
|
1287
1285
|
|
|
1288
|
-
sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy
|
|
1286
|
+
sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * vec_hsum_i32x4(v_xy);
|
|
1289
1287
|
}
|
|
1290
1288
|
|
|
1291
1289
|
*s = sumf;
|
|
@@ -1354,8 +1352,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
1354
1352
|
|
|
1355
1353
|
h >>= 4;
|
|
1356
1354
|
|
|
1357
|
-
sumi1 += (vsumi0
|
|
1358
|
-
sumi2 += (vsumi1
|
|
1355
|
+
sumi1 += vec_hsum_i32x4(vsumi0) * ls1;
|
|
1356
|
+
sumi2 += vec_hsum_i32x4(vsumi1) * ls2;
|
|
1359
1357
|
}
|
|
1360
1358
|
|
|
1361
1359
|
sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
|