@fugood/llama.node 1.3.0-rc.6 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. package/CMakeLists.txt +12 -2
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +8 -9
  4. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  5. package/src/llama.cpp/common/arg.cpp +39 -1001
  6. package/src/llama.cpp/common/arg.h +2 -2
  7. package/src/llama.cpp/common/chat.cpp +216 -2
  8. package/src/llama.cpp/common/chat.h +1 -0
  9. package/src/llama.cpp/common/common.cpp +33 -0
  10. package/src/llama.cpp/common/common.h +13 -0
  11. package/src/llama.cpp/common/download.cpp +1054 -0
  12. package/src/llama.cpp/common/download.h +55 -0
  13. package/src/llama.cpp/common/json-schema-to-grammar.cpp +19 -3
  14. package/src/llama.cpp/ggml/CMakeLists.txt +3 -1
  15. package/src/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
  16. package/src/llama.cpp/ggml/include/ggml.h +2 -0
  17. package/src/llama.cpp/ggml/src/CMakeLists.txt +7 -3
  18. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +10 -3
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
  23. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +0 -5
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -35
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
  26. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
  27. package/src/llama.cpp/include/llama.h +7 -3
  28. package/src/llama.cpp/src/CMakeLists.txt +95 -0
  29. package/src/llama.cpp/src/llama-arch.cpp +108 -0
  30. package/src/llama.cpp/src/llama-arch.h +11 -0
  31. package/src/llama.cpp/src/llama-batch.cpp +63 -31
  32. package/src/llama.cpp/src/llama-batch.h +12 -1
  33. package/src/llama.cpp/src/llama-chat.cpp +32 -0
  34. package/src/llama.cpp/src/llama-chat.h +1 -0
  35. package/src/llama.cpp/src/llama-context.cpp +44 -16
  36. package/src/llama.cpp/src/llama-context.h +5 -5
  37. package/src/llama.cpp/src/llama-cparams.h +1 -0
  38. package/src/llama.cpp/src/llama-graph.cpp +12 -7
  39. package/src/llama.cpp/src/llama-hparams.cpp +11 -1
  40. package/src/llama.cpp/src/llama-hparams.h +6 -0
  41. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
  42. package/src/llama.cpp/src/llama-kv-cache.cpp +56 -21
  43. package/src/llama.cpp/src/llama-kv-cache.h +2 -4
  44. package/src/llama.cpp/src/llama-kv-cells.h +44 -2
  45. package/src/llama.cpp/src/llama-memory-recurrent.cpp +18 -14
  46. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  47. package/src/llama.cpp/src/llama-model.cpp +350 -13194
  48. package/src/llama.cpp/src/llama-model.h +9 -2
  49. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  50. package/src/llama.cpp/src/llama-vocab.cpp +5 -0
  51. package/src/llama.cpp/src/llama-vocab.h +1 -0
  52. package/src/llama.cpp/src/models/apertus.cpp +125 -0
  53. package/src/llama.cpp/src/models/arcee.cpp +135 -0
  54. package/src/llama.cpp/src/models/arctic.cpp +138 -0
  55. package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
  56. package/src/llama.cpp/src/models/baichuan.cpp +122 -0
  57. package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
  58. package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  59. package/src/llama.cpp/src/models/bert.cpp +176 -0
  60. package/src/llama.cpp/src/models/bitnet.cpp +160 -0
  61. package/src/llama.cpp/src/models/bloom.cpp +101 -0
  62. package/src/llama.cpp/src/models/chameleon.cpp +178 -0
  63. package/src/llama.cpp/src/models/chatglm.cpp +132 -0
  64. package/src/llama.cpp/src/models/codeshell.cpp +111 -0
  65. package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
  66. package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  67. package/src/llama.cpp/src/models/command-r.cpp +122 -0
  68. package/src/llama.cpp/src/models/dbrx.cpp +123 -0
  69. package/src/llama.cpp/src/models/deci.cpp +135 -0
  70. package/src/llama.cpp/src/models/deepseek.cpp +144 -0
  71. package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
  72. package/src/llama.cpp/src/models/dots1.cpp +134 -0
  73. package/src/llama.cpp/src/models/dream.cpp +105 -0
  74. package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  75. package/src/llama.cpp/src/models/ernie4-5.cpp +111 -0
  76. package/src/llama.cpp/src/models/exaone.cpp +114 -0
  77. package/src/llama.cpp/src/models/exaone4.cpp +123 -0
  78. package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
  79. package/src/llama.cpp/src/models/falcon.cpp +120 -0
  80. package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  81. package/src/llama.cpp/src/models/gemma.cpp +112 -0
  82. package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  83. package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  84. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  85. package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
  86. package/src/llama.cpp/src/models/glm4.cpp +127 -0
  87. package/src/llama.cpp/src/models/gpt2.cpp +105 -0
  88. package/src/llama.cpp/src/models/gptneox.cpp +144 -0
  89. package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  90. package/src/llama.cpp/src/models/granite.cpp +211 -0
  91. package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  92. package/src/llama.cpp/src/models/grok.cpp +159 -0
  93. package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
  94. package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  95. package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  96. package/src/llama.cpp/src/models/internlm2.cpp +120 -0
  97. package/src/llama.cpp/src/models/jais.cpp +86 -0
  98. package/src/llama.cpp/src/models/jamba.cpp +106 -0
  99. package/src/llama.cpp/src/models/lfm2.cpp +173 -0
  100. package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
  101. package/src/llama.cpp/src/models/llada.cpp +99 -0
  102. package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
  103. package/src/llama.cpp/src/models/llama.cpp +155 -0
  104. package/src/llama.cpp/src/models/mamba.cpp +55 -0
  105. package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
  106. package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
  107. package/src/llama.cpp/src/models/models.h +481 -0
  108. package/src/llama.cpp/src/models/mpt.cpp +126 -0
  109. package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
  110. package/src/llama.cpp/src/models/nemotron.cpp +122 -0
  111. package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
  112. package/src/llama.cpp/src/models/olmo.cpp +121 -0
  113. package/src/llama.cpp/src/models/olmo2.cpp +150 -0
  114. package/src/llama.cpp/src/models/olmoe.cpp +124 -0
  115. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +123 -0
  116. package/src/llama.cpp/src/models/openelm.cpp +124 -0
  117. package/src/llama.cpp/src/models/orion.cpp +123 -0
  118. package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  119. package/src/llama.cpp/src/models/phi2.cpp +121 -0
  120. package/src/llama.cpp/src/models/phi3.cpp +152 -0
  121. package/src/llama.cpp/src/models/plamo.cpp +110 -0
  122. package/src/llama.cpp/src/models/plamo2.cpp +316 -0
  123. package/src/llama.cpp/src/models/plm.cpp +168 -0
  124. package/src/llama.cpp/src/models/qwen.cpp +108 -0
  125. package/src/llama.cpp/src/models/qwen2.cpp +117 -0
  126. package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
  127. package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
  128. package/src/llama.cpp/src/models/qwen3.cpp +117 -0
  129. package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
  130. package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  131. package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
  132. package/src/llama.cpp/src/models/refact.cpp +94 -0
  133. package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  134. package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
  135. package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  136. package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  137. package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
  138. package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
  139. package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
  140. package/src/llama.cpp/src/models/smollm3.cpp +128 -0
  141. package/src/llama.cpp/src/models/stablelm.cpp +146 -0
  142. package/src/llama.cpp/src/models/starcoder.cpp +100 -0
  143. package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
  144. package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
  145. package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
  146. package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  147. package/src/llama.cpp/src/models/xverse.cpp +108 -0
package/src/llama.cpp/common/download.h (new file)
@@ -0,0 +1,55 @@
+ #pragma once
+
+ #include <string>
+
+ struct common_params_model;
+
+ //
+ // download functionalities
+ //
+
+ struct common_cached_model_info {
+     std::string manifest_path;
+     std::string user;
+     std::string model;
+     std::string tag;
+     size_t size = 0; // GGUF size in bytes
+     std::string to_string() const {
+         return user + "/" + model + ":" + tag;
+     }
+ };
+
+ struct common_hf_file_res {
+     std::string repo; // repo name with ":tag" removed
+     std::string ggufFile;
+     std::string mmprojFile;
+ };
+
+ /**
+  * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+  * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+  * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+  * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+  * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+  *
+  * Return pair of <repo, file> (with "repo" already having tag removed)
+  *
+  * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+  */
+ common_hf_file_res common_get_hf_file(
+     const std::string & hf_repo_with_tag,
+     const std::string & bearer_token,
+     bool offline);
+
+ // returns true if download succeeded
+ bool common_download_model(
+     const common_params_model & model,
+     const std::string & bearer_token,
+     bool offline);
+
+ // returns list of cached models
+ std::vector<common_cached_model_info> common_list_cached_models();
+
+ // resolve and download model from Docker registry
+ // return local path to downloaded model file
+ std::string common_docker_resolve_model(const std::string & docker);
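For orientation, a minimal usage sketch of the new header (not part of the package; it assumes the program is compiled and linked against llama.cpp's common library so the implementations in download.cpp are available, and the example repo string is taken from the doc comment above):

    #include <cstdio>
    #include "download.h"

    int main() {
        // resolve an ollama-style "repo:tag" to a concrete GGUF file via the HF API
        common_hf_file_res res = common_get_hf_file(
            "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",
            /* bearer_token */ "",
            /* offline      */ false);
        std::printf("repo: %s  file: %s\n", res.repo.c_str(), res.ggufFile.c_str());

        // enumerate models already present in the local cache
        for (const common_cached_model_info & m : common_list_cached_models()) {
            std::printf("cached: %s (%zu bytes)\n", m.to_string().c_str(), m.size);
        }
        return 0;
    }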
package/src/llama.cpp/common/json-schema-to-grammar.cpp
@@ -601,7 +601,10 @@ private:
      }

      std::string _resolve_ref(const std::string & ref) {
-         std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
+         auto it = ref.find('#');
+         std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
+         static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
+         std::string ref_name = "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-");
          if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
              _refs_being_resolved.insert(ref);
              json resolved = _refs[ref];
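The old rule name kept only the text after the last '/', so two different JSON-pointer refs ending in the same token (for example an array index) collapsed to the same rule name. The new code derives the name from the whole '#' fragment and sanitizes it. A standalone illustration of the sanitization, using only the standard library (not taken from the package):

    #include <iostream>
    #include <regex>
    #include <string>

    int main() {
        const std::string ref = "#/definitions/foo.bar/0";
        auto it = ref.find('#');
        std::string fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
        static const std::regex nonalphanumeric(R"([^a-zA-Z0-9-]+)");
        // old naming: ref.substr(ref.find_last_of('/') + 1) -> "0"
        // new naming below                                  -> "ref-definitions-foo-bar-0"
        std::cout << "ref" + std::regex_replace(fragment, nonalphanumeric, "-") << "\n";
        return 0;
    }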
@@ -774,11 +777,24 @@ public:
          std::vector<std::string> tokens = string_split(pointer, "/");
          for (size_t i = 1; i < tokens.size(); ++i) {
              std::string sel = tokens[i];
-             if (target.is_null() || !target.contains(sel)) {
+             if (target.is_object() && target.contains(sel)) {
+                 target = target[sel];
+             } else if (target.is_array()) {
+                 size_t sel_index;
+                 try {
+                     sel_index = std::stoul(sel);
+                 } catch (const std::invalid_argument & e) {
+                     sel_index = target.size();
+                 }
+                 if (sel_index >= target.size()) {
+                     _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
+                     return;
+                 }
+                 target = target[sel_index];
+             } else {
                  _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
                  return;
              }
-             target = target[sel];
          }
          _refs[ref] = target;
      }
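This JSON-pointer walker previously only descended into objects, so a $ref pointing into an array (for example "#/prefixItems/1") produced a resolution error. A standalone sketch of the new behavior (not from the package), assuming nlohmann/json, the JSON library this file already uses:

    #include <iostream>
    #include <string>
    #include <vector>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    int main() {
        json schema = json::parse(R"({
            "prefixItems": [ { "type": "string" }, { "type": "integer" } ]
        })");

        json target = schema;
        std::vector<std::string> tokens = {"prefixItems", "1"}; // i.e. "#/prefixItems/1"
        for (const std::string & sel : tokens) {
            if (target.is_object() && target.contains(sel)) {
                target = target[sel];
            } else if (target.is_array()) {
                target = target[std::stoul(sel)];   // numeric tokens now index arrays
            } else {
                std::cerr << "cannot resolve " << sel << " in " << target.dump() << "\n";
                return 1;
            }
        }
        std::cout << target.dump() << "\n";         // prints {"type":"integer"}
        return 0;
    }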
package/src/llama.cpp/ggml/CMakeLists.txt
@@ -168,7 +168,7 @@ option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
  option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
  option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
  option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
- option(GGML_VXE "ggml: enable vxe" ON)
+ option(GGML_VXE "ggml: enable vxe" ${GGML_NATIVE})

  option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
  set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
@@ -251,6 +251,8 @@ option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adr
  set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
       "gmml: OpenCL API version to target")

+ option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
+
  # toolchain for vulkan-shaders-gen
  set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")

package/src/llama.cpp/ggml/include/ggml-hexagon.h (new file)
@@ -0,0 +1,19 @@
+ #pragma once
+
+ #include "ggml.h"
+ #include "ggml-backend.h"
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ // backend API
+ GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(void);
+
+ GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend);
+
+ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void);
+
+ #ifdef __cplusplus
+ }
+ #endif
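A minimal sketch of how the three declarations fit together (not from the package; it assumes the build was configured with GGML_HEXAGON=ON and that ggml_backend_hexagon_init() returns nullptr when no Hexagon device is available, which the header itself does not state):

    #include <cstdio>
    #include "ggml-hexagon.h"

    int main() {
        ggml_backend_t backend = ggml_backend_hexagon_init();
        if (backend == nullptr) {
            std::fprintf(stderr, "Hexagon backend unavailable\n");
            return 1;
        }
        if (ggml_backend_is_hexagon(backend)) {
            std::printf("backend: %s\n", ggml_backend_name(backend));
        }
        ggml_backend_free(backend); // regular ggml-backend lifetime management
        return 0;
    }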
package/src/llama.cpp/ggml/include/ggml.h
@@ -242,6 +242,7 @@
  #define GGML_ROPE_TYPE_NEOX 2
  #define GGML_ROPE_TYPE_MROPE 8
  #define GGML_ROPE_TYPE_VISION 24
+ #define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000

  #define GGML_MROPE_SECTIONS 4

@@ -2107,6 +2108,7 @@ extern "C" {
      enum ggml_scale_mode {
          GGML_SCALE_MODE_NEAREST = 0,
          GGML_SCALE_MODE_BILINEAR = 1,
+         GGML_SCALE_MODE_BICUBIC = 2,

          GGML_SCALE_MODE_COUNT
      };
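A tiny standalone check (not from the patch) of how the new RoPE-type value composes: like GGML_ROPE_TYPE_VISION (24 = 8 | 16), the new IMROPE value keeps the M-RoPE bit set and adds a further bit, presumably marking the interleaved M-RoPE layout used by the new qwen3vl model files listed above. The constants are mirrored locally so the snippet is self-contained:

    // values mirrored from the ggml.h hunk above
    constexpr int ROPE_TYPE_MROPE  = 8;
    constexpr int ROPE_TYPE_VISION = 24;
    constexpr int ROPE_TYPE_IMROPE = 40;

    static_assert(ROPE_TYPE_IMROPE == (ROPE_TYPE_MROPE | 32), "40 == 0b101000");
    static_assert((ROPE_TYPE_VISION & ROPE_TYPE_MROPE) != 0, "VISION keeps the M-RoPE bit");
    static_assert((ROPE_TYPE_IMROPE & ROPE_TYPE_MROPE) != 0, "IMROPE keeps the M-RoPE bit");

    int main() { return 0; }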
package/src/llama.cpp/ggml/src/CMakeLists.txt
@@ -308,6 +308,10 @@ function(ggml_add_cpu_backend_variant tag_name)
              set(GGML_INTERNAL_${feat} ON)
          endforeach()
      elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+         foreach (feat VXE2 NNPA)
+             set(GGML_INTERNAL_${feat} OFF)
+         endforeach()
+
          foreach (feat ${ARGN})
              set(GGML_INTERNAL_${feat} ON)
          endforeach()
@@ -377,9 +381,8 @@ if (GGML_CPU_ALL_VARIANTS)
          endif()
      elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
          if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-             ggml_add_cpu_backend_variant(s390x_z15 Z15 VXE)
-             # ggml_add_cpu_backend_variant(s390x_z16 Z16 VXE)
-             # ggml_add_cpu_backend_variant(s390x_z17 Z17 VXE)
+             ggml_add_cpu_backend_variant(z15 Z15 VXE2)
+             ggml_add_cpu_backend_variant(z16 Z16 VXE2 NNPA)
          else()
              message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
          endif()
@@ -402,6 +405,7 @@ ggml_add_backend(Vulkan)
  ggml_add_backend(WebGPU)
  ggml_add_backend(zDNN)
  ggml_add_backend(OpenCL)
+ ggml_add_backend(Hexagon)

  foreach (target ggml-base ggml)
      target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
@@ -504,11 +504,18 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
          endforeach()
      endif()

-     if (GGML_VXE OR GGML_INTERNAL_VXE)
-         message(STATUS "VX/VXE/VXE2 enabled")
+     if (GGML_VXE OR GGML_INTERNAL_VXE2)
+         message(STATUS "VXE2 enabled")
          list(APPEND ARCH_FLAGS -mvx -mzvector)
-         list(APPEND ARCH_DEFINITIONS GGML_VXE)
+         list(APPEND ARCH_DEFINITIONS GGML_USE_VXE2)
      endif()
+
+     if (GGML_INTERNAL_NNPA)
+         message(STATUS "NNPA enabled")
+         list(APPEND ARCH_DEFINITIONS GGML_USE_NNPA)
+     endif()
+
+     ggml_add_cpu_backend_features(${GGML_CPU_NAME} s390 ${ARCH_DEFINITIONS})
  elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
      message(STATUS "Wasm detected")
      list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c
@@ -700,7 +700,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
      for (; ib + 1 < nb; ib += 2) {

          // Compute combined scale for the block 0 and 1
-         const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
+         const float ft0 = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
+         const __m128 d_0_1 = (__m128)(v4f32){ft0, ft0, ft0, ft0};

          const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);

@@ -714,11 +715,9 @@
          bx_1 = __lsx_vsub_b(bx_1, off);
          const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);

-         //_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
-         //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
-
          // Compute combined scale for the block 2 and 3
-         const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
+         const float ft1 = GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d);
+         const __m128 d_2_3 = (__m128)(v4f32){ft1, ft1, ft1, ft1};

          const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);

package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -580,16 +580,19 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
          const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
          uint8_t *patmp = atmp;
          int vsums;
-         int tmp;
+         int tmp, t1, t2, t3, t4, t5, t6, t7;
          __asm__ __volatile__(
              "vsetivli zero, 16, e8, m1\n\t"
              "vmv.v.x v8, zero\n\t"
+             "lb zero, 15(%[sc])\n\t"
              "vle8.v v1, (%[sc])\n\t"
+             "vle8.v v2, (%[bsums])\n\t"
+             "addi %[tmp], %[bsums], 16\n\t"
              "vand.vi v0, v1, 0xF\n\t"
              "vsrl.vi v1, v1, 4\n\t"
+             "vle8.v v3, (%[tmp])\n\t"
              "vse8.v v0, (%[scale])\n\t"
              "vsetivli zero, 16, e16, m2\n\t"
-             "vle16.v v2, (%[bsums])\n\t"
              "vzext.vf2 v0, v1\n\t"
              "vwmul.vv v4, v0, v2\n\t"
              "vsetivli zero, 16, e32, m4\n\t"
@@ -608,46 +611,89 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

          for (int j = 0; j < QK_K/128; ++j) {
              __asm__ __volatile__(
-                 "vsetvli zero, %[vl32], e8, m2\n\t"
+                 "lb zero, 31(%[q2])\n\t"
+                 "addi %[tmp], %[q2], 16\n\t"
+                 "addi %[t1], %[q8], 16\n\t"
+                 "vsetivli zero, 16, e8, m1\n\t"
                  "vle8.v v0, (%[q2])\n\t"
+                 "vle8.v v1, (%[tmp])\n\t"
                  "vsrl.vi v2, v0, 2\n\t"
+                 "vsrl.vi v3, v1, 2\n\t"
                  "vsrl.vi v4, v0, 4\n\t"
+                 "addi %[tmp], %[q8], 32\n\t"
+                 "vle8.v v8, (%[q8])\n\t"
+                 "vle8.v v9, (%[t1])\n\t"
+                 "addi %[t1], %[t1], 32\n\t"
+                 "vsrl.vi v5, v1, 4\n\t"
                  "vsrl.vi v6, v0, 6\n\t"
+                 "vsrl.vi v7, v1, 6\n\t"
+                 "vle8.v v10, (%[tmp])\n\t"
+                 "vle8.v v11, (%[t1])\n\t"
+                 "addi %[tmp], %[tmp], 32\n\t"
+                 "addi %[t1], %[t1], 32\n\t"
                  "vand.vi v0, v0, 0x3\n\t"
+                 "vand.vi v1, v1, 0x3\n\t"
                  "vand.vi v2, v2, 0x3\n\t"
+                 "vle8.v v12, (%[tmp])\n\t"
+                 "vle8.v v13, (%[t1])\n\t"
+                 "addi %[tmp], %[tmp], 32\n\t"
+                 "addi %[t1], %[t1], 32\n\t"
+                 "vand.vi v3, v3, 0x3\n\t"
                  "vand.vi v4, v4, 0x3\n\t"
-                 "vsetvli zero, %[vl128], e8, m8\n\t"
-                 "vle8.v v8, (%[q8])\n\t"
-                 "vsetvli zero, %[vl64], e8, m4\n\t"
+                 "vand.vi v5, v5, 0x3\n\t"
+                 "vle8.v v14, (%[tmp])\n\t"
+                 "vle8.v v15, (%[t1])\n\t"
                  "vwmul.vv v16, v0, v8\n\t"
+                 "vwmul.vv v18, v1, v9\n\t"
+                 "vwmul.vv v20, v2, v10\n\t"
+                 "vwmul.vv v22, v3, v11\n\t"
                  "vwmul.vv v24, v4, v12\n\t"
-                 "vsetivli zero, 16, e16, m2\n\t"
+                 "vwmul.vv v26, v5, v13\n\t"
+                 "vwmul.vv v28, v6, v14\n\t"
+                 "vwmul.vv v30, v7, v15\n\t"
+                 "vsetivli zero, 8, e16, m1\n\t"
                  "vmv.v.x v0, zero\n\t"
-                 "vwredsum.vs v10, v16, v0\n\t"
+                 "lbu %[tmp], 0(%[scale])\n\t"
+                 "vwredsum.vs v8, v16, v0\n\t"
                  "vwredsum.vs v9, v18, v0\n\t"
-                 "vwredsum.vs v8, v20, v0\n\t"
-                 "vwredsum.vs v7, v22, v0\n\t"
-                 "vwredsum.vs v11, v24, v0\n\t"
-                 "vwredsum.vs v12, v26, v0\n\t"
-                 "vwredsum.vs v13, v28, v0\n\t"
-                 "vwredsum.vs v14, v30, v0\n\t"
+                 "lbu %[t1], 1(%[scale])\n\t"
+                 "vwredsum.vs v10, v20, v0\n\t"
+                 "vwredsum.vs v11, v22, v0\n\t"
+                 "lbu %[t2], 2(%[scale])\n\t"
+                 "vwredsum.vs v12, v24, v0\n\t"
+                 "vwredsum.vs v13, v26, v0\n\t"
+                 "lbu %[t3], 3(%[scale])\n\t"
+                 "vwredsum.vs v14, v28, v0\n\t"
+                 "vwredsum.vs v15, v30, v0\n\t"
+                 "lbu %[t4], 4(%[scale])\n\t"
+                 "vwredsum.vs v8, v17, v8\n\t"
+                 "vwredsum.vs v9, v19, v9\n\t"
+                 "lbu %[t5], 5(%[scale])\n\t"
+                 "vwredsum.vs v10, v21, v10\n\t"
+                 "vwredsum.vs v11, v23, v11\n\t"
+                 "lbu %[t6], 6(%[scale])\n\t"
+                 "vwredsum.vs v12, v25, v12\n\t"
+                 "vwredsum.vs v13, v27, v13\n\t"
+                 "lbu %[t7], 7(%[scale])\n\t"
+                 "vwredsum.vs v14, v29, v14\n\t"
+                 "vwredsum.vs v15, v31, v15\n\t"
                  "vsetivli zero, 4, e32, m1\n\t"
-                 "vslideup.vi v10, v9, 1\n\t"
-                 "vslideup.vi v8, v7, 1\n\t"
-                 "vslideup.vi v11, v12, 1\n\t"
-                 "vslideup.vi v13, v14, 1\n\t"
-                 "vslideup.vi v10, v8, 2\n\t"
-                 "vslideup.vi v11, v13, 2\n\t"
-                 "vsetivli zero, 8, e32, m2\n\t"
-                 "vle8.v v15, (%[scale])\n\t"
-                 "vzext.vf4 v12, v15\n\t"
-                 "vmul.vv v10, v10, v12\n\t"
-                 "vredsum.vs v0, v10, v0\n\t"
+                 "vmul.vx v0, v8, %[tmp]\n\t"
+                 "vmul.vx v1, v9, %[t1]\n\t"
+                 "vmacc.vx v0, %[t2], v10\n\t"
+                 "vmacc.vx v1, %[t3], v11\n\t"
+                 "vmacc.vx v0, %[t4], v12\n\t"
+                 "vmacc.vx v1, %[t5], v13\n\t"
+                 "vmacc.vx v0, %[t6], v14\n\t"
+                 "vmacc.vx v1, %[t7], v15\n\t"
                  "vmv.x.s %[tmp], v0\n\t"
-                 "add %[isum], %[isum], %[tmp]"
-                 : [tmp] "=&r" (tmp), [isum] "+&r" (isum)
+                 "vmv.x.s %[t1], v1\n\t"
+                 "add %[isum], %[isum], %[tmp]\n\t"
+                 "add %[isum], %[isum], %[t1]"
+                 : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+                 , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
+                 , [isum] "+&r" (isum)
                  : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
-                 , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
                  : "memory"
                  , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
                  , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
@@ -929,7 +975,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
          const int8_t * restrict q8 = y[i].qs;

          int8_t * scale = (int8_t *)utmp;
-         int tmp;
+         int tmp, t1, t2, t3, t4, t5, t6, t7;
          __asm__ __volatile__(
              "vsetivli zero, 12, e8, m1\n\t"
              "vle8.v v0, (%[s6b])\n\t"
@@ -967,19 +1013,23 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
          int isum = 0;
          for (int j = 0; j < QK_K; j += 128) {
              __asm__ __volatile__(
+                 "lb zero, 31(%[q3])\n\t"
                  "vsetvli zero, %[vl32], e8, m2, ta, mu\n\t"
                  "vle8.v v8, (%[q3])\n\t"
                  "vsrl.vi v10, v8, 2\n\t"
                  "vsrl.vi v12, v8, 4\n\t"
                  "vsrl.vi v14, v8, 6\n\t"
+                 "lb zero, 64(%[q8])\n\t"
                  "vand.vi v8, v8, 3\n\t"
                  "vand.vi v10, v10, 3\n\t"
                  "vand.vi v12, v12, 3\n\t"
                  "vle8.v v2, (%[qh])\n\t"
+                 "lb zero, 127(%[q8])\n\t"
                  "vand.vx v4, v2, %[m]\n\t"
                  "slli %[m], %[m], 1\n\t"
                  "vmseq.vx v0, v4, zero\n\t"
                  "vadd.vi v8, v8, -4, v0.t\n\t"
+                 "lb zero, 0(%[q8])\n\t"
                  "vand.vx v4, v2, %[m]\n\t"
                  "slli %[m], %[m], 1\n\t"
                  "vmseq.vx v0, v4, zero\n\t"
@@ -994,34 +1044,43 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
                  "vadd.vi v14, v14, -4, v0.t\n\t"
                  "vsetvli zero, %[vl128], e8, m8\n\t"
                  "vle8.v v0, (%[q8])\n\t"
+                 "lb %[tmp], 0(%[scale])\n\t"
+                 "lb %[t1], 1(%[scale])\n\t"
+                 "lb %[t2], 2(%[scale])\n\t"
+                 "lb %[t3], 3(%[scale])\n\t"
                  "vsetvli zero, %[vl64], e8, m4\n\t"
                  "vwmul.vv v16, v0, v8\n\t"
                  "vwmul.vv v24, v4, v12\n\t"
                  "vsetivli zero, 16, e16, m2\n\t"
                  "vmv.v.x v0, zero\n\t"
-                 "vwredsum.vs v10, v16, v0\n\t"
+                 "vwredsum.vs v8, v16, v0\n\t"
+                 "lb %[t4], 4(%[scale])\n\t"
+                 "lb %[t5], 5(%[scale])\n\t"
                  "vwredsum.vs v9, v18, v0\n\t"
-                 "vwredsum.vs v8, v20, v0\n\t"
-                 "vwredsum.vs v7, v22, v0\n\t"
-                 "vwredsum.vs v11, v24, v0\n\t"
-                 "vwredsum.vs v12, v26, v0\n\t"
-                 "vwredsum.vs v13, v28, v0\n\t"
-                 "vwredsum.vs v14, v30, v0\n\t"
+                 "vwredsum.vs v10, v20, v0\n\t"
+                 "vwredsum.vs v11, v22, v0\n\t"
+                 "vwredsum.vs v12, v24, v0\n\t"
+                 "lb %[t6], 6(%[scale])\n\t"
+                 "lb %[t7], 7(%[scale])\n\t"
+                 "vwredsum.vs v13, v26, v0\n\t"
+                 "vwredsum.vs v14, v28, v0\n\t"
+                 "vwredsum.vs v15, v30, v0\n\t"
                  "vsetivli zero, 4, e32, m1\n\t"
-                 "vslideup.vi v10, v9, 1\n\t"
-                 "vslideup.vi v8, v7, 1\n\t"
-                 "vslideup.vi v11, v12, 1\n\t"
-                 "vslideup.vi v13, v14, 1\n\t"
-                 "vslideup.vi v10, v8, 2\n\t"
-                 "vslideup.vi v11, v13, 2\n\t"
-                 "vsetivli zero, 8, e32, m2\n\t"
-                 "vle8.v v15, (%[scale])\n\t"
-                 "vsext.vf4 v12, v15\n\t"
-                 "vmul.vv v10, v10, v12\n\t"
-                 "vredsum.vs v0, v10, v0\n\t"
+                 "vmul.vx v0, v8, %[tmp]\n\t"
+                 "vmul.vx v1, v9, %[t1]\n\t"
+                 "vmacc.vx v0, %[t2], v10\n\t"
+                 "vmacc.vx v1, %[t3], v11\n\t"
+                 "vmacc.vx v0, %[t4], v12\n\t"
+                 "vmacc.vx v1, %[t5], v13\n\t"
+                 "vmacc.vx v0, %[t6], v14\n\t"
+                 "vmacc.vx v1, %[t7], v15\n\t"
                  "vmv.x.s %[tmp], v0\n\t"
-                 "add %[isum], %[isum], %[tmp]"
-                 : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum)
+                 "vmv.x.s %[t1], v1\n\t"
+                 "add %[isum], %[isum], %[tmp]\n\t"
+                 "add %[isum], %[isum], %[t1]"
+                 : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+                 , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
+                 , [m] "+&r" (m), [isum] "+&r" (isum)
                  : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
                  , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
                  : "memory"
package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp (new file)
@@ -0,0 +1,50 @@
+ #include "ggml-backend-impl.h"
+
+ #if defined(__s390x__)
+ #include <sys/auxv.h>
+
+ // find hwcap bits in asm/elf.h
+ #ifndef HWCAP_VXRS_EXT2
+ #define HWCAP_VXRS_EXT2 (1 << 15)
+ #endif
+
+ #ifndef HWCAP_NNPA
+ #define HWCAP_NNPA (1 << 20)
+ #endif
+
+ struct s390x_features {
+     bool has_vxe2 = false;
+     bool has_nnpa = false;
+
+     s390x_features() {
+         uint32_t hwcap = getauxval(AT_HWCAP);
+         // NOTE: use hwcap2 with DFLT for z17 and later
+         // uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+         has_vxe2 = !!(hwcap & HWCAP_VXRS_EXT2);
+         has_nnpa = !!(hwcap & HWCAP_NNPA);
+     }
+ };
+
+ static int ggml_backend_cpu_s390x_score() {
+     int score = 1;
+     s390x_features sf;
+
+ // IBM z15 / LinuxONE 3
+ #ifdef GGML_USE_VXE2
+     if (!sf.has_vxe2) { return 0; }
+     score += 1 << 1;
+ #endif
+
+ // IBM z16 / LinuxONE 4 and z17 / LinuxONE 5
+ #ifdef GGML_USE_NNPA
+     if (!sf.has_nnpa) { return 0; }
+     score += 1 << 2;
+ #endif
+
+     return score;
+ }
+
+ GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_s390x_score)
+
+ #endif // __s390x__
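To make the scoring concrete, a small standalone sketch (not from the package) that mirrors the arithmetic above; it assumes the usual ggml behavior that, when the CPU backend is built as multiple variants (GGML_CPU_ALL_VARIANTS with GGML_BACKEND_DL), the loader picks the variant reporting the highest non-zero score on the running machine:

    #include <cstdio>

    // same arithmetic as ggml_backend_cpu_s390x_score(), with the compile-time
    // feature set passed in explicitly instead of via the GGML_USE_* defines
    static int variant_score(bool built_vxe2, bool built_nnpa, bool has_vxe2, bool has_nnpa) {
        int score = 1;
        if (built_vxe2) { if (!has_vxe2) return 0; score += 1 << 1; }
        if (built_nnpa) { if (!has_nnpa) return 0; score += 1 << 2; }
        return score;
    }

    int main() {
        // on a z16 (VXE2 + NNPA) the z16 build wins; on a z15 it disqualifies itself
        std::printf("z15 variant on z16: %d\n", variant_score(true, false, true, true));  // 3
        std::printf("z16 variant on z16: %d\n", variant_score(true, true,  true, true));  // 7
        std::printf("z16 variant on z15: %d\n", variant_score(true, true,  true, false)); // 0
        return 0;
    }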
package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
@@ -500,13 +500,15 @@ inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {

  #endif

- #if defined(__loongarch_asx)
+ #if defined(__loongarch_sx)
  /* float type data load instructions */
  static __m128 __lsx_vreplfr2vr_s(const float val) {
      v4f32 res = {val, val, val, val};
      return (__m128)res;
  }
+ #endif

+ #if defined(__loongarch_asx)
  static __m256 __lasx_xvreplfr2vr_s(const float val) {
      v8f32 res = {val, val, val, val, val, val, val, val};
      return (__m256)res;
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1613,13 +1613,8 @@ static void ggml_compute_forward_mul_mat_id(
          chunk_size = 64;
      }

- #if defined(__aarch64__)
-     // disable for ARM
-     const bool disable_chunking = true;
- #else
      // disable for NUMA
      const bool disable_chunking = ggml_is_numa();
- #endif // defined(__aarch64__)

      int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
      int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;