@fugood/llama.node 1.2.0-rc.0 → 1.2.0

package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.2.0-rc.0",
+  "version": "1.2.0",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.2.0-rc.0",
-    "@fugood/node-llama-linux-x64-vulkan": "1.2.0-rc.0",
-    "@fugood/node-llama-linux-x64-cuda": "1.2.0-rc.0",
-    "@fugood/node-llama-linux-arm64": "1.2.0-rc.0",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.2.0-rc.0",
-    "@fugood/node-llama-linux-arm64-cuda": "1.2.0-rc.0",
-    "@fugood/node-llama-win32-x64": "1.2.0-rc.0",
-    "@fugood/node-llama-win32-x64-vulkan": "1.2.0-rc.0",
-    "@fugood/node-llama-win32-x64-cuda": "1.2.0-rc.0",
-    "@fugood/node-llama-win32-arm64": "1.2.0-rc.0",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.2.0-rc.0",
-    "@fugood/node-llama-darwin-x64": "1.2.0-rc.0",
-    "@fugood/node-llama-darwin-arm64": "1.2.0-rc.0"
+    "@fugood/node-llama-linux-x64": "1.2.0",
+    "@fugood/node-llama-linux-x64-vulkan": "1.2.0",
+    "@fugood/node-llama-linux-x64-cuda": "1.2.0",
+    "@fugood/node-llama-linux-arm64": "1.2.0",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.2.0",
+    "@fugood/node-llama-linux-arm64-cuda": "1.2.0",
+    "@fugood/node-llama-win32-x64": "1.2.0",
+    "@fugood/node-llama-win32-x64-vulkan": "1.2.0",
+    "@fugood/node-llama-win32-x64-cuda": "1.2.0",
+    "@fugood/node-llama-win32-arm64": "1.2.0",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.2.0",
+    "@fugood/node-llama-darwin-x64": "1.2.0",
+    "@fugood/node-llama-darwin-arm64": "1.2.0"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
@@ -618,6 +618,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
+        case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: return "DeepSeek V3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
         case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
@@ -685,11 +686,13 @@ static void parse_json_tool_calls(
     size_t from = std::string::npos;
     auto first = true;
     while (true) {
+        auto start_pos = builder.pos();
         auto res = function_regex_start_only && first
             ? builder.try_consume_regex(*function_regex_start_only)
             : function_regex
                 ? builder.try_find_regex(*function_regex, from)
                 : std::nullopt;
+
         if (res) {
             std::string name;
             if (get_function_name) {
@@ -724,6 +727,8 @@ static void parse_json_tool_calls(
                 return;
             }
             throw common_chat_msg_partial_exception("incomplete tool call");
+        } else {
+            builder.move_to(start_pos);
         }
         break;
     }
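The two hunks above capture the parser position before the speculative regex match and rewind to it when no tool call is found, so text scanned past during a failed search can still be handed back as regular content. A minimal sketch of the pattern, using a toy parser in place of common_chat_msg_parser (only pos()/move_to() mirror the real builder API; everything else is illustrative):

```cpp
#include <optional>
#include <string>

struct toy_parser {
    std::string input;
    size_t      cursor = 0;

    size_t pos() const { return cursor; }
    void   move_to(size_t p) { cursor = p; }

    // Speculative scan: searches forward for `lit` and leaves the cursor at
    // the end of the scan even when nothing was found.
    std::optional<size_t> try_find_literal(const std::string & lit) {
        size_t hit = input.find(lit, cursor);
        if (hit == std::string::npos) {
            cursor = input.size(); // consumed everything while scanning
            return std::nullopt;
        }
        cursor = hit + lit.size();
        return cursor;
    }
};

// Without the rewind, a failed scan would swallow the remaining text and it
// could never be re-emitted as plain content.
void demo(toy_parser & p) {
    auto start_pos = p.pos();
    if (!p.try_find_literal("<tool_call>")) {
        p.move_to(start_pos); // hand the unmatched text back to the caller
    }
}
```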
@@ -1374,6 +1379,71 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
     }
     return data;
 }
+
+static common_chat_params common_chat_params_init_deepseek_v3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Pass thinking context for DeepSeek V3.1 template
+    json additional_context = {
+        {"thinking", inputs.enable_thinking},
+    };
+
+    auto prompt = apply(tmpl, inputs,
+                        /* messages_override= */ inputs.messages,
+                        /* tools_override= */ std::nullopt,
+                        additional_context);
+    data.prompt = prompt;
+    data.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+    if (string_ends_with(data.prompt, "<think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+    if (inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+                tool_rules.push_back(builder.add_rule(name + "-call",
+                    "( \"<｜tool▁call▁begin｜>\" )? \"" + name + "<｜tool▁sep｜>"
+                    "\" " + builder.add_schema(name + "-args", parameters) + " "
+                    "\"<｜tool▁call▁end｜>\""));
+            });
+            // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
+            // so we accept common variants (then it's all constrained)
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                "( \"<｜tool▁calls▁begin｜>\" | \"<｜tool_calls_begin｜>\" | \"<｜tool calls begin｜>\" | \"<｜tool\\\\_calls\\\\_begin｜>\" | \"<｜tool▁calls｜>\" ) "
+                "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
+                "\"<｜tool▁calls▁end｜>\""
+                " space");
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                // If thinking_forced_open, then we capture the </think> tag in the grammar,
+                // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+                std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+                "(<｜tool▁calls▁begin｜>|<｜tool_calls_begin｜>|<｜tool calls begin｜>|<｜tool\\\\_calls\\\\_begin｜>|<｜tool▁calls｜>)[\\s\\S]*"
+            });
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<｜tool▁calls▁begin｜>",
+                "<｜tool▁call▁begin｜>",
+                "<｜tool▁sep｜>",
+                "<｜tool▁call▁end｜>",
+                "<｜tool▁calls▁end｜>",
+            };
+        });
+    }
+    return data;
+}
+
 static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
     builder.try_parse_reasoning("<think>", "</think>");
     if (!builder.syntax().parse_tool_calls) {
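The prompt-suffix handling above is the crux of thinking_forced_open: when the rendered template ends in a dangling `<think>`, the model starts out inside a reasoning block. A standalone sketch of just that decision (string_ends_with mirrors the helper used in the hunk; the struct is illustrative, not the real common_chat_params):

```cpp
#include <string>

static bool string_ends_with(const std::string & s, const std::string & suffix) {
    return s.size() >= suffix.size() &&
           s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
}

struct thinking_state {
    std::string prompt;
    bool        forced_open = false;
};

// If the template left a dangling "<think>" at the end of the prompt, either
// close it immediately (thinking disabled) or remember that generation will
// begin inside a reasoning block (thinking enabled).
static thinking_state resolve_thinking(std::string prompt, bool enable_thinking) {
    thinking_state st { std::move(prompt) };
    if (string_ends_with(st.prompt, "<think>")) {
        if (!enable_thinking) {
            st.prompt += "</think>";
        } else {
            st.forced_open = true;
        }
    }
    return st;
}
```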
@@ -1395,6 +1465,66 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
         tool_calls_end);
 }
 
+static void common_chat_parse_deepseek_v3_1_content(common_chat_msg_parser & builder) {
+    static const common_regex function_regex("(?:<｜tool▁call▁begin｜>)?([^\\n<]+)(?:<｜tool▁sep｜>)");
+
+    static const common_regex close_regex("(?:[\\s]*)?<｜tool▁call▁end｜>");
+    static const common_regex tool_calls_begin("(?:<｜tool▁calls▁begin｜>|<｜tool_calls_begin｜>|<｜tool calls begin｜>|<｜tool\\\\_calls\\\\_begin｜>|<｜tool▁calls｜>)");
+    static const common_regex tool_calls_end("<｜tool▁calls▁end｜>");
+
+    if (!builder.syntax().parse_tool_calls) {
+        LOG_DBG("%s: not parse_tool_calls\n", __func__);
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    LOG_DBG("%s: parse_tool_calls\n", __func__);
+
+    parse_json_tool_calls(
+        builder,
+        /* block_open= */ tool_calls_begin,
+        /* function_regex_start_only= */ std::nullopt,
+        function_regex,
+        close_regex,
+        tool_calls_end);
+}
+
+static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
+    // DeepSeek V3.1 outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
+    // First try to parse using the standard reasoning parsing method
+    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
+
+    auto start_pos = builder.pos();
+    auto found_end_think = builder.try_find_literal("</think>");
+    builder.move_to(start_pos);
+
+    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
+        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
+        common_chat_parse_deepseek_v3_1_content(builder);
+    } else if (builder.try_parse_reasoning("<think>", "</think>")) {
+        // If reasoning was parsed successfully, the remaining content is regular content
+        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
+        // </think><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>NAME\n```json\nJSON\n```<｜tool▁call▁end｜><｜tool▁calls▁end｜>
+        common_chat_parse_deepseek_v3_1_content(builder);
+    } else {
+        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+            LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
+            common_chat_parse_deepseek_v3_1_content(builder);
+            return;
+        }
+        // If no reasoning tags found, check if we should treat everything as reasoning
+        if (builder.syntax().thinking_forced_open) {
+            // If thinking is forced open but no tags found, treat everything as reasoning
+            LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
+            builder.add_reasoning_content(builder.consume_rest());
+        } else {
+            LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
+            // <｜tool▁call▁begin｜>NAME<｜tool▁sep｜>JSON<｜tool▁call▁end｜>
+            common_chat_parse_deepseek_v3_1_content(builder);
+        }
+    }
+}
+
 static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     auto prompt = apply(tmpl, inputs);
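A hedged usage sketch of the new parser via the public entry point in common/chat.h (common_chat_parse and the common_chat_syntax fields below reflect the upstream llama.cpp API of this period; exact field names can drift between versions):

```cpp
#include "chat.h" // common/chat.h in llama.cpp

common_chat_msg parse_v3_1_example() {
    common_chat_syntax syntax;
    syntax.format               = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
    syntax.reasoning_format     = COMMON_REASONING_FORMAT_DEEPSEEK;
    syntax.thinking_forced_open = true; // the rendered prompt ended with "<think>"

    // Reasoning first, then a regular answer. With tool calls, the text after
    // "</think>" would instead start with "<｜tool▁calls▁begin｜>".
    const std::string output =
        "The user asked for the capital of France.</think>Paris.";

    // Splits the output into msg.reasoning_content and msg.content.
    return common_chat_parse(output, /* is_partial= */ false, syntax);
}
```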
@@ -2351,6 +2481,12 @@ static common_chat_params common_chat_templates_apply_jinja(
         }
     }
 
+    // DeepSeek V3.1: detect based on specific patterns in the template
+    if (src.find("message['prefix'] is defined and message['prefix'] and thinking") != std::string::npos &&
+        params.json_schema.is_null()) {
+        return common_chat_params_init_deepseek_v3_1(tmpl, params);
+    }
+
     // DeepSeek R1: use handler in all cases except json schema (thinking / tools).
     if (src.find("<｜tool▁calls▁begin｜>") != std::string::npos && params.json_schema.is_null()) {
         return common_chat_params_init_deepseek_r1(tmpl, params);
@@ -2523,6 +2659,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
             common_chat_parse_deepseek_r1(builder);
             break;
+        case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1:
+            common_chat_parse_deepseek_v3_1(builder);
+            break;
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
             common_chat_parse_functionary_v3_2(builder);
             break;
@@ -118,6 +118,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+    COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
    COMMON_CHAT_FORMAT_HERMES_2_PRO,
    COMMON_CHAT_FORMAT_COMMAND_R7B,
    COMMON_CHAT_FORMAT_GRANITE,
@@ -843,9 +843,10 @@ public:
                 _build_object_rule(
                     properties, required, name,
                     schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
-        } else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
+        } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
             std::unordered_set<std::string> required;
             std::vector<std::pair<std::string, json>> properties;
+            std::map<std::string, size_t> enum_values;
             std::string hybrid_name = name;
             std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
                 if (comp_schema.contains("$ref")) {
@@ -857,6 +858,14 @@ public:
                             required.insert(prop.key());
                         }
                     }
+                } else if (comp_schema.contains("enum")) {
+                    for (const auto & v : comp_schema["enum"]) {
+                        const auto rule = _generate_constant_rule(v);
+                        if (enum_values.find(rule) == enum_values.end()) {
+                            enum_values[rule] = 0;
+                        }
+                        enum_values[rule] += 1;
+                    }
                 } else {
                     // todo warning
                 }
@@ -870,6 +879,17 @@ public:
                     add_component(t, true);
                 }
             }
+            if (!enum_values.empty()) {
+                std::vector<std::string> enum_intersection;
+                for (const auto & p : enum_values) {
+                    if (p.second == schema["allOf"].size()) {
+                        enum_intersection.push_back(p.first);
+                    }
+                }
+                if (!enum_intersection.empty()) {
+                    return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
+                }
+            }
             return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
         } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
             json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
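The three hunks above extend the allOf handling: enum constants are tallied per component, and when every allOf component agreed on some constants, the schema collapses to an alternation over that intersection instead of an object rule. For example, `{"allOf": [{"enum": ["a", "b"]}, {"enum": ["b", "c"]}]}` keeps only `"b"`. A standalone sketch of the counting logic using nlohmann::json (the `json` type in this file); `enum_intersection` here is a hypothetical helper name, and `v.dump()` stands in for `_generate_constant_rule(v)`:

```cpp
#include <map>
#include <string>
#include <vector>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Returns the serialized constants present in *every* allOf component,
// mirroring the enum_values map in the hunk above.
std::vector<std::string> enum_intersection(const json & all_of) {
    std::map<std::string, size_t> counts; // serialized constant -> occurrence count
    for (const auto & comp : all_of) {
        if (!comp.contains("enum")) {
            continue; // this component never increments, so it vetoes the shortcut
        }
        for (const auto & v : comp["enum"]) {
            counts[v.dump()] += 1;
        }
    }
    std::vector<std::string> result;
    for (const auto & [constant, n] : counts) {
        if (n == all_of.size()) { // seen in every component, as in the hunk
            result.push_back(constant);
        }
    }
    return result;
}
```

Note that, as in the hunk, a component without an enum keeps every count below the allOf size, so the intersection comes out empty and the object rule is used instead.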
@@ -132,6 +132,8 @@ extern "C" {
        GGML_BACKEND_DEVICE_TYPE_CPU,
        // GPU device using dedicated memory
        GGML_BACKEND_DEVICE_TYPE_GPU,
+       // integrated GPU device using host memory
+       GGML_BACKEND_DEVICE_TYPE_IGPU,
        // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
        GGML_BACKEND_DEVICE_TYPE_ACCEL
    };
@@ -150,11 +152,21 @@ extern "C" {
 
    // all the device properties
    struct ggml_backend_dev_props {
+       // device name
        const char * name;
+       // device description
        const char * description;
+       // device free memory in bytes
        size_t memory_free;
+       // device total memory in bytes
        size_t memory_total;
+       // device type
        enum ggml_backend_dev_type type;
+       // device id
+       // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
+       // if the id is unknown, this should be NULL
+       const char * device_id;
+       // device capabilities
        struct ggml_backend_dev_caps caps;
    };
 
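A short sketch printing the extended properties for every registered device (ggml_backend_dev_count, ggml_backend_dev_get and ggml_backend_dev_get_props are existing ggml-backend API; only the loop itself is illustrative):

```cpp
#include <cstdio>
#include "ggml-backend.h"

static void list_devices(void) {
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);

        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);

        // device_id may be NULL when the id is unknown, per the comment above
        printf("%s (%s): type=%d id=%s free=%zu total=%zu\n",
               props.name, props.description, (int) props.type,
               props.device_id ? props.device_id : "(unknown)",
               props.memory_free, props.memory_total);
    }
}
```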
@@ -134,6 +134,7 @@ extern "C" {
    GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
    GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
+   GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
    GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
    GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
    GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
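A hedged sketch of the new conversion helper: like the other ggml_cpu_fp32_to_* routines it converts n contiguous elements, and per the note added to ggml.h further down, the fractional part is discarded (C truncation semantics):

```cpp
#include <cstdint>
#include <cstdio>
#include "ggml-cpu.h"

int main() {
    const float src[4] = { 1.9f, -2.5f, 0.1f, 7.0f };
    int32_t     dst[4];

    // truncates toward zero: 1, -2, 0, 7
    ggml_cpu_fp32_to_i32(src, dst, 4);

    printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);
    return 0;
}
```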
@@ -43,14 +43,8 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
-GGML_DEPRECATED(
-        GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
-        "obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
-
 GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
 
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
-
 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
 // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
@@ -1404,6 +1404,7 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
 
+   // note: casting from f32 to i32 will discard the fractional part
    GGML_API struct ggml_tensor * ggml_cast(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
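A minimal graph exercising the documented cast (illustrative sizes; ggml_init, ggml_cast, ggml_new_graph and ggml_graph_compute_with_ctx are existing API, and the f32 -> i32 path on CPU is what the new ggml_cpu_fp32_to_i32 above plausibly backs):

```cpp
#include <cstdio>
#include "ggml.h"
#include "ggml-cpu.h"

int main() {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16 * 1024 * 1024,
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    float * ad = (float *) a->data;
    ad[0] = 3.7f; ad[1] = -1.2f; ad[2] = 0.5f; ad[3] = 2.0f;

    // per the note above, the fractional part is discarded: 3, -1, 0, 2
    struct ggml_tensor * b = ggml_cast(ctx, a, GGML_TYPE_I32);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, b);
    ggml_graph_compute_with_ctx(ctx, gf, /* n_threads = */ 1);

    const int32_t * bd = (const int32_t *) b->data;
    printf("%d %d %d %d\n", bd[0], bd[1], bd[2], bd[3]);

    ggml_free(ctx);
    return 0;
}
```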
@@ -1528,7 +1529,11 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
 
-   // supports 3D: a->ne[2] == b->ne[1]
+   // supports 4D a:
+   //      a      [n_embd, ne1, ne2, ne3]
+   //      b      I32 [n_rows, ne2, ne3, 1]
+   //
+   //      return [n_embd, n_rows, ne2, ne3]
    GGML_API struct ggml_tensor * ggml_get_rows(
            struct ggml_context * ctx,
            struct ggml_tensor  * a, // data
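A shape-only sketch of the widened contract (tensor sizes are made up; the ne layout follows the comment above):

```cpp
#include "ggml.h"

// gather 4 rows from each of the 2*3 inner matrices of a
struct ggml_tensor * gather_rows_4d(struct ggml_context * ctx) {
    // a: [n_embd = 8, ne1 = 10, ne2 = 2, ne3 = 3]
    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 8, 10, 2, 3);
    // b: I32 [n_rows = 4, ne2 = 2, ne3 = 3, 1] -- row indices per (ne2, ne3) slice
    struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_I32, 4, 2, 3, 1);
    // result: [n_embd = 8, n_rows = 4, ne2 = 2, ne3 = 3]
    return ggml_get_rows(ctx, a, b);
}
```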
@@ -224,7 +224,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
                string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
                if (NOT ${feature_pos} EQUAL -1)
-                   message(STATUS "ARM feature ${feature} enabled")
+                   # Special handling for MATMUL_INT8 when machine doesn't support i8mm
+                   if ("${feature}" STREQUAL "MATMUL_INT8" AND GGML_MACHINE_SUPPORTS_noi8mm)
+                       message(STATUS "ARM feature ${feature} detected but unsetting due to machine not supporting i8mm")
+                       list(APPEND ARCH_FLAGS -U__ARM_FEATURE_MATMUL_INT8)
+                   else()
+                       message(STATUS "ARM feature ${feature} enabled")
+                   endif()
                endif()
            endforeach()
        endif()
@@ -53,9 +53,9 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
 #if defined(__VXE__) || defined(__VXE2__)
    for (int i = 0; i < nb; i++) {
-       __vector float srcv [8];
-       __vector float asrcv[8];
-       __vector float amaxv[8];
+       float32x4_t srcv [8];
+       float32x4_t asrcv[8];
+       float32x4_t amaxv[8];
 
        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
@@ -74,8 +74,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
        y[i].d = GGML_CPU_FP32_TO_FP16(d);
 
        for (int j = 0; j < 8; j++) {
-           const __vector float v = vec_mul(srcv[j], vec_splats(id));
-           const __vector int32_t vi = vec_signed(v);
+           const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
+           const int32x4_t vi = vec_signed(v);
 
            y[i].qs[4*j + 0] = vec_extract(vi, 0);
            y[i].qs[4*j + 1] = vec_extract(vi, 1);
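The renames in this file read naturally if NEON-style aliases exist for the z/Architecture vector types; a plausible set of typedefs, written as an assumption for illustration (the real definitions live in the backend's shared headers, not in this file):

```cpp
#include <stdint.h>
#include <vecintrin.h> // z/Architecture vector intrinsics (requires -mzvector)

typedef __vector float    float32x4_t;
typedef __vector int32_t  int32x4_t;
typedef __vector int16_t  int16x8_t;
typedef __vector int8_t   int8x16_t;
typedef __vector uint8_t  uint8x16_t;
```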
@@ -98,9 +98,9 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
 #if defined(__VXE__) || defined(__VXE2__)
    for (int i = 0; i < nb; i++) {
-       __vector float srcv [8];
-       __vector float asrcv[8];
-       __vector float amaxv[8];
+       float32x4_t srcv [8];
+       float32x4_t asrcv[8];
+       float32x4_t amaxv[8];
 
        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
@@ -118,11 +118,11 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
        y[i].d = GGML_CPU_FP32_TO_FP16(d);
 
-       __vector int32_t acc = vec_splats(0);
+       int32x4_t acc = vec_splats(0);
 
        for (int j = 0; j < 8; j++) {
-           const __vector float v = vec_mul(srcv[j], vec_splats(id));
-           const __vector int32_t vi = vec_signed(v);
+           const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
+           const int32x4_t vi = vec_signed(v);
 
            y[i].qs[4*j + 0] = vec_extract(vi, 0);
            y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -162,37 +162,36 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    float sumf = 0;
 
 #if defined(__VXE__) || defined(__VXE2__)
-   __vector float acc = vec_splats(0.0f);
+   float32x4_t acc = vec_splats(0.0f);
 
-   const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F);
-   const __vector int8_t  v_s = vec_splats( (const int8_t)0x08);
+   const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
+   const int8x16_t  v_s = vec_splats( (const int8_t)0x08);
 
    for (; ib < nb; ++ib) {
-       const __vector uint8_t v_x = vec_xl(0, x[ib].qs);
-       const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m);
-       const __vector int8_t v_xh = (const __vector int8_t)(v_x >> 4);
+       const uint8x16_t v_x = vec_xl(0, x[ib].qs);
+       const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
+       const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);
 
-       const __vector int8_t v_xls = vec_sub(v_xl, v_s);
-       const __vector int8_t v_xhs = vec_sub(v_xh, v_s);
+       const int8x16_t v_xls = vec_sub(v_xl, v_s);
+       const int8x16_t v_xhs = vec_sub(v_xh, v_s);
 
-       const __vector int8_t v_yl = vec_xl(0      , y[ib].qs);
-       const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
+       const int8x16_t v_yl = vec_xl(0      , y[ib].qs);
+       const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
 
-       const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl);
-       const __vector int16_t v_xylse = vec_mule(v_xls, v_yl);
-       const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh);
-       const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh);
+       const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
+       const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
+       const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
+       const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);
 
-       __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
+       int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
 
-       const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
-       const __vector float v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
+       const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
+       const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
 
        acc = vec_madd(v_xy, v_d, acc);
    }
 
-   sumf = acc[0] + acc[1] + acc[2] + acc[3];
-
+   sumf = vec_hsum_f32x4(acc);
    *s = sumf;
 #else
    UNUSED(nb);
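From here on, every horizontal reduction funnels through vec_hsum_f32x4/vec_hsum_i32x4 instead of spelling out the four-lane sum. A plausible shape for these helpers, assuming the typedefs sketched earlier (per-lane indexing on vector types is the same GCC extension the removed code relied on):

```cpp
static inline float vec_hsum_f32x4(float32x4_t v) {
    return v[0] + v[1] + v[2] + v[3];
}

static inline int32_t vec_hsum_i32x4(int32x4_t v) {
    return v[0] + v[1] + v[2] + v[3];
}
```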
@@ -249,8 +248,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
        acc = vec_madd(v_xy, v_d, acc);
    }
 
-   sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
-
+   sumf = vec_hsum_f32x4(acc) + summs;
    *s = sumf;
 #else
    UNUSED(nb);
@@ -351,7 +349,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
    }
 
-   sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1);
+   sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1);
 
    #pragma GCC unroll 4
    for (; ib < nb; ++ib) {
@@ -390,7 +388,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
        const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
 
-       sumf += vec_hsum(v_acc);
+       sumf += vec_hsum_f32x4(v_acc);
    }
 
    *s = sumf;
@@ -502,7 +500,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
        v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
    }
 
-   sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1) + summs0 + summs1;
+   sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1) + summs0 + summs1;
 
    #pragma GCC unroll 4
    for (; ib < nb; ++ib) {
@@ -543,7 +541,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
        const float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc);
 
-       sumf += vec_hsum(v_acc) + summs;
+       sumf += vec_hsum_f32x4(v_acc) + summs;
    }
 
    *s = sumf;
@@ -575,7 +573,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    float sumf = 0;
 
 #if defined(__VXE__) || defined(__VXE2__)
-   __vector float acc = vec_splats(0.0f);
+   float32x4_t acc = vec_splats(0.0f);
 
    #pragma GCC unroll 8
    for (; ib < nb; ++ib) {
@@ -594,7 +592,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        acc = vec_madd(v_xy, v_d, acc);
    }
 
-   sumf = acc[0] + acc[1] + acc[2] + acc[3];
+   sumf = vec_hsum_f32x4(acc);
 
    *s = sumf;
 #else
@@ -718,10 +716,10 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
            isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
 
-           isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
-           isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
-           isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
-           isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
+           isum += vec_hsum_i32x4(isum0) * scale[0];
+           isum += vec_hsum_i32x4(isum1) * scale[1];
+           isum += vec_hsum_i32x4(isum2) * scale[2];
+           isum += vec_hsum_i32x4(isum3) * scale[3];
 
            scale += 4;
 
@@ -819,7 +817,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
 
            const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
-           sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0];
+           sumi1 += vec_hsum_i32x4(p1) * scales[2*j+0];
 
            v_y[0] = vec_xl(0 , y0);
            v_y[1] = vec_xl(16, y0);
@@ -829,7 +827,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
 
            const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
-           sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1];
+           sumi2 += vec_hsum_i32x4(p2) * scales[2*j+1];
        }
 
        sumf += d * (sumi1 + sumi2);
@@ -911,7 +909,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
        const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
        const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
        const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
-       const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
+       const int32_t mins = vec_hsum_i32x4(v_mins);
 
        const uint8_t * scales = (const uint8_t *)utmp;
        const uint8_t * GGML_RESTRICT x0l = x[i].qs;
@@ -948,8 +946,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
            int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
 
-           sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++;
-           sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++;
+           sumi += vec_hsum_i32x4(sumi0) * *scales++;
+           sumi += vec_hsum_i32x4(sumi1) * *scales++;
        }
 
        sumf += d * sumi - dmin * mins;
@@ -1020,7 +1018,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
        const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
        const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
 
-       const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
+       const int32_t mins = vec_hsum_i32x4(v_mins);
 
        int32_t isum = 0;
        for (int j = 0; j < QK_K/128; ++j) {
@@ -1060,10 +1058,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
            int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
 
-           isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
-                   (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
-                   (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
-                   (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
+           isum += vec_hsum_i32x4(summs0) * scale[0] +
+                   vec_hsum_i32x4(summs1) * scale[1] +
+                   vec_hsum_i32x4(summs2) * scale[2] +
+                   vec_hsum_i32x4(summs3) * scale[3];
 
            scale += 4;
 
@@ -1094,10 +1092,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
            summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
 
-           isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
-                   (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
-                   (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
-                   (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
+           isum += vec_hsum_i32x4(summs0) * scale[0] +
+                   vec_hsum_i32x4(summs1) * scale[1] +
+                   vec_hsum_i32x4(summs2) * scale[2] +
+                   vec_hsum_i32x4(summs3) * scale[3];
 
            scale += 4;
        }
@@ -1285,7 +1283,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
        const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
        const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
 
-       sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
+       sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * vec_hsum_i32x4(v_xy);
    }
 
    *s = sumf;
@@ -1354,8 +1352,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 
            h >>= 4;
 
-           sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1;
-           sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
+           sumi1 += vec_hsum_i32x4(vsumi0) * ls1;
+           sumi2 += vec_hsum_i32x4(vsumi1) * ls2;
        }
 
        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);