@fugood/llama.node 1.3.0-rc.6 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +12 -2
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +8 -9
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +39 -1001
- package/src/llama.cpp/common/arg.h +2 -2
- package/src/llama.cpp/common/chat.cpp +216 -2
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +33 -0
- package/src/llama.cpp/common/common.h +13 -0
- package/src/llama.cpp/common/download.cpp +1054 -0
- package/src/llama.cpp/common/download.h +55 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +19 -3
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
- package/src/llama.cpp/ggml/include/ggml.h +2 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +7 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +10 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +0 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -35
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
- package/src/llama.cpp/include/llama.h +7 -3
- package/src/llama.cpp/src/CMakeLists.txt +95 -0
- package/src/llama.cpp/src/llama-arch.cpp +108 -0
- package/src/llama.cpp/src/llama-arch.h +11 -0
- package/src/llama.cpp/src/llama-batch.cpp +63 -31
- package/src/llama.cpp/src/llama-batch.h +12 -1
- package/src/llama.cpp/src/llama-chat.cpp +32 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +44 -16
- package/src/llama.cpp/src/llama-context.h +5 -5
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +12 -7
- package/src/llama.cpp/src/llama-hparams.cpp +11 -1
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
- package/src/llama.cpp/src/llama-kv-cache.cpp +56 -21
- package/src/llama.cpp/src/llama-kv-cache.h +2 -4
- package/src/llama.cpp/src/llama-kv-cells.h +44 -2
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +18 -14
- package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
- package/src/llama.cpp/src/llama-model.cpp +350 -13194
- package/src/llama.cpp/src/llama-model.h +9 -2
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +5 -0
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/models/apertus.cpp +125 -0
- package/src/llama.cpp/src/models/arcee.cpp +135 -0
- package/src/llama.cpp/src/models/arctic.cpp +138 -0
- package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
- package/src/llama.cpp/src/models/baichuan.cpp +122 -0
- package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
- package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
- package/src/llama.cpp/src/models/bert.cpp +176 -0
- package/src/llama.cpp/src/models/bitnet.cpp +160 -0
- package/src/llama.cpp/src/models/bloom.cpp +101 -0
- package/src/llama.cpp/src/models/chameleon.cpp +178 -0
- package/src/llama.cpp/src/models/chatglm.cpp +132 -0
- package/src/llama.cpp/src/models/codeshell.cpp +111 -0
- package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
- package/src/llama.cpp/src/models/command-r.cpp +122 -0
- package/src/llama.cpp/src/models/dbrx.cpp +123 -0
- package/src/llama.cpp/src/models/deci.cpp +135 -0
- package/src/llama.cpp/src/models/deepseek.cpp +144 -0
- package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
- package/src/llama.cpp/src/models/dots1.cpp +134 -0
- package/src/llama.cpp/src/models/dream.cpp +105 -0
- package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/src/llama.cpp/src/models/ernie4-5.cpp +111 -0
- package/src/llama.cpp/src/models/exaone.cpp +114 -0
- package/src/llama.cpp/src/models/exaone4.cpp +123 -0
- package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
- package/src/llama.cpp/src/models/falcon.cpp +120 -0
- package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
- package/src/llama.cpp/src/models/gemma.cpp +112 -0
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
- package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
- package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
- package/src/llama.cpp/src/models/glm4.cpp +127 -0
- package/src/llama.cpp/src/models/gpt2.cpp +105 -0
- package/src/llama.cpp/src/models/gptneox.cpp +144 -0
- package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
- package/src/llama.cpp/src/models/granite.cpp +211 -0
- package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
- package/src/llama.cpp/src/models/grok.cpp +159 -0
- package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
- package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
- package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
- package/src/llama.cpp/src/models/internlm2.cpp +120 -0
- package/src/llama.cpp/src/models/jais.cpp +86 -0
- package/src/llama.cpp/src/models/jamba.cpp +106 -0
- package/src/llama.cpp/src/models/lfm2.cpp +173 -0
- package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
- package/src/llama.cpp/src/models/llada.cpp +99 -0
- package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
- package/src/llama.cpp/src/models/llama.cpp +155 -0
- package/src/llama.cpp/src/models/mamba.cpp +55 -0
- package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
- package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
- package/src/llama.cpp/src/models/models.h +481 -0
- package/src/llama.cpp/src/models/mpt.cpp +126 -0
- package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
- package/src/llama.cpp/src/models/nemotron.cpp +122 -0
- package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
- package/src/llama.cpp/src/models/olmo.cpp +121 -0
- package/src/llama.cpp/src/models/olmo2.cpp +150 -0
- package/src/llama.cpp/src/models/olmoe.cpp +124 -0
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +123 -0
- package/src/llama.cpp/src/models/openelm.cpp +124 -0
- package/src/llama.cpp/src/models/orion.cpp +123 -0
- package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
- package/src/llama.cpp/src/models/phi2.cpp +121 -0
- package/src/llama.cpp/src/models/phi3.cpp +152 -0
- package/src/llama.cpp/src/models/plamo.cpp +110 -0
- package/src/llama.cpp/src/models/plamo2.cpp +316 -0
- package/src/llama.cpp/src/models/plm.cpp +168 -0
- package/src/llama.cpp/src/models/qwen.cpp +108 -0
- package/src/llama.cpp/src/models/qwen2.cpp +117 -0
- package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
- package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
- package/src/llama.cpp/src/models/qwen3.cpp +117 -0
- package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
- package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
- package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
- package/src/llama.cpp/src/models/refact.cpp +94 -0
- package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
- package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
- package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
- package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
- package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
- package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
- package/src/llama.cpp/src/models/smollm3.cpp +128 -0
- package/src/llama.cpp/src/models/stablelm.cpp +146 -0
- package/src/llama.cpp/src/models/starcoder.cpp +100 -0
- package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
- package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
- package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
- package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/src/llama.cpp/src/models/xverse.cpp +108 -0
package/src/llama.cpp/common/download.h

```diff
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <string>
+
+struct common_params_model;
+
+//
+// download functionalities
+//
+
+struct common_cached_model_info {
+    std::string manifest_path;
+    std::string user;
+    std::string model;
+    std::string tag;
+    size_t size = 0; // GGUF size in bytes
+    std::string to_string() const {
+        return user + "/" + model + ":" + tag;
+    }
+};
+
+struct common_hf_file_res {
+    std::string repo; // repo name with ":tag" removed
+    std::string ggufFile;
+    std::string mmprojFile;
+};
+
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+ *
+ * Return pair of <repo, file> (with "repo" already having tag removed)
+ *
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+common_hf_file_res common_get_hf_file(
+    const std::string & hf_repo_with_tag,
+    const std::string & bearer_token,
+    bool offline);
+
+// returns true if download succeeded
+bool common_download_model(
+    const common_params_model & model,
+    const std::string & bearer_token,
+    bool offline);
+
+// returns list of cached models
+std::vector<common_cached_model_info> common_list_cached_models();
+
+// resolve and download model from Docker registry
+// return local path to downloaded model file
+std::string common_docker_resolve_model(const std::string & docker);
```
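The new header gives the Node binding the same model-fetch surface as upstream llama.cpp. A hypothetical caller, sketched under the assumption that `common.h` supplies the `common_params_model` definition and that the header is reachable as `"download.h"`; only the declarations shown above come from this release:

```cpp
#include <cstdio>
#include "common.h"   // assumed to provide common_params_model
#include "download.h"

int main() {
    // ollama-style "repo:tag" resolution, as documented in the header comment
    common_hf_file_res res = common_get_hf_file(
        "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",
        /* bearer_token */ "",
        /* offline      */ false);
    printf("repo: %s, gguf: %s\n", res.repo.c_str(), res.ggufFile.c_str());

    // enumerate models already present in the local cache
    for (const auto & info : common_list_cached_models()) {
        printf("cached: %s (%zu bytes)\n", info.to_string().c_str(), info.size);
    }
    return 0;
}
```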
package/src/llama.cpp/common/json-schema-to-grammar.cpp

```diff
@@ -601,7 +601,10 @@ private:
     }

     std::string _resolve_ref(const std::string & ref) {
-
+        auto it = ref.find('#');
+        std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
+        static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
+        std::string ref_name = "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-");
         if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
             _refs_being_resolved.insert(ref);
             json resolved = _refs[ref];
```
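The new `_resolve_ref` prologue derives the grammar rule name from the `$ref` fragment alone, replacing every run of characters outside `[a-zA-Z0-9-]` with `-`. A standalone illustration using the exact regex from the hunk (the sample URL is invented):

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
    const std::string ref = "https://example.com/schema.json#/definitions/foo_bar";
    auto it = ref.find('#');
    std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
    static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
    // prints "ref-definitions-foo-bar": '/' and '_' both collapse to '-'
    std::cout << "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-") << "\n";
}
```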
package/src/llama.cpp/common/json-schema-to-grammar.cpp (continued)

```diff
@@ -774,11 +777,24 @@ public:
             std::vector<std::string> tokens = string_split(pointer, "/");
             for (size_t i = 1; i < tokens.size(); ++i) {
                 std::string sel = tokens[i];
-                if (target.
+                if (target.is_object() && target.contains(sel)) {
+                    target = target[sel];
+                } else if (target.is_array()) {
+                    size_t sel_index;
+                    try {
+                        sel_index = std::stoul(sel);
+                    } catch (const std::invalid_argument & e) {
+                        sel_index = target.size();
+                    }
+                    if (sel_index >= target.size()) {
+                        _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
+                        return;
+                    }
+                    target = target[sel_index];
+                } else {
                     _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
                     return;
                 }
-                target = target[sel];
             }
             _refs[ref] = target;
         }
```
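The widened pointer walk means `$ref`s can now step into arrays by numeric selector instead of failing on any non-object target. A self-contained sketch of the same loop against nlohmann::json (the schema literal is made up):

```cpp
#include <iostream>
#include <string>
#include <vector>
#include <nlohmann/json.hpp>

int main() {
    auto schema = nlohmann::json::parse(R"({
        "definitions": { "pair": { "items": [ {"type": "string"}, {"type": "integer"} ] } }
    })");
    // resolve "#/definitions/pair/items/1" the way the patched loop does
    nlohmann::json target = schema;
    for (const std::string & sel : std::vector<std::string>{"definitions", "pair", "items", "1"}) {
        if (target.is_object() && target.contains(sel)) {
            target = target[sel];
        } else if (target.is_array()) {
            target = target[std::stoul(sel)]; // numeric selector indexes the array
        } else {
            std::cerr << "unresolvable selector: " << sel << "\n";
            return 1;
        }
    }
    std::cout << target.dump() << "\n"; // {"type":"integer"}
}
```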
package/src/llama.cpp/ggml/CMakeLists.txt

```diff
@@ -168,7 +168,7 @@ option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
 option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
 option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
 option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
-option(GGML_VXE "ggml: enable vxe"
+option(GGML_VXE "ggml: enable vxe" ${GGML_NATIVE})

 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
@@ -251,6 +251,8 @@ option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adr
 set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
                                        "gmml: OpenCL API version to target")

+option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
+
 # toolchain for vulkan-shaders-gen
 set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")

```
package/src/llama.cpp/ggml/include/ggml-hexagon.h

```diff
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
```
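A minimal probe of the new backend entry points, assuming a build configured with `GGML_HEXAGON=ON`; `ggml_backend_free` is the standard ggml-backend teardown and is not part of this header:

```cpp
#include <cstdio>
#include "ggml-hexagon.h"

int main() {
    ggml_backend_t backend = ggml_backend_hexagon_init();
    if (backend == nullptr) {
        printf("Hexagon backend unavailable on this device\n");
        return 1;
    }
    printf("is_hexagon: %d\n", ggml_backend_is_hexagon(backend));
    ggml_backend_free(backend); // standard ggml-backend cleanup
    return 0;
}
```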
package/src/llama.cpp/ggml/include/ggml.h

```diff
@@ -242,6 +242,7 @@
 #define GGML_ROPE_TYPE_NEOX 2
 #define GGML_ROPE_TYPE_MROPE 8
 #define GGML_ROPE_TYPE_VISION 24
+#define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000

 #define GGML_MROPE_SECTIONS 4

```
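A quick encoding check for the new constant; the bit relationship is inferred from the neighbouring defines (MROPE is 8, VISION is 24, i.e. 16 | 8), so treat it as a reading aid rather than documented semantics:

```cpp
// 40 == 0b101000: the MROPE bit (8) plus a new high bit (32).
static_assert(40 == 0b101000, "matches the binary comment");
static_assert((40 & 8) == 8, "IMROPE keeps the MROPE bit set, as VISION (24) does");
```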
package/src/llama.cpp/ggml/include/ggml.h (continued)

```diff
@@ -2107,6 +2108,7 @@ extern "C" {
     enum ggml_scale_mode {
         GGML_SCALE_MODE_NEAREST = 0,
         GGML_SCALE_MODE_BILINEAR = 1,
+        GGML_SCALE_MODE_BICUBIC = 2,

         GGML_SCALE_MODE_COUNT
     };
```
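For orientation, bicubic scaling conventionally weights a 4-sample neighbourhood per axis with a cubic kernel. The sketch below prints Catmull-Rom weights (a = -0.5); it illustrates the interpolation family only and is not the ggml kernel, which this diff does not show:

```cpp
#include <cmath>
#include <cstdio>

// Keys cubic convolution weight for a sample at distance x, a = -0.5 (Catmull-Rom)
static float cubic_weight(float x) {
    const float a = -0.5f;
    x = std::fabs(x);
    if (x < 1.0f) { return ((a + 2.0f) * x - (a + 3.0f)) * x * x + 1.0f; }
    if (x < 2.0f) { return (((x - 5.0f) * x + 8.0f) * x - 4.0f) * a; }
    return 0.0f;
}

int main() {
    const float t = 0.25f; // fractional position between the two middle taps
    float sum = 0.0f;
    for (int i = -1; i <= 2; ++i) {
        const float w = cubic_weight(t - (float) i);
        sum += w;
        printf("w[%+d] = % .4f\n", i, w);
    }
    printf("sum = %.4f\n", sum); // the four weights sum to 1
}
```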
package/src/llama.cpp/ggml/src/CMakeLists.txt

```diff
@@ -308,6 +308,10 @@ function(ggml_add_cpu_backend_variant tag_name)
             set(GGML_INTERNAL_${feat} ON)
         endforeach()
     elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+        foreach (feat VXE2 NNPA)
+            set(GGML_INTERNAL_${feat} OFF)
+        endforeach()
+
         foreach (feat ${ARGN})
             set(GGML_INTERNAL_${feat} ON)
         endforeach()
@@ -377,9 +381,8 @@ if (GGML_CPU_ALL_VARIANTS)
         endif()
     elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
         if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-            ggml_add_cpu_backend_variant(
-
-            # ggml_add_cpu_backend_variant(s390x_z17 Z17 VXE)
+            ggml_add_cpu_backend_variant(z15 Z15 VXE2)
+            ggml_add_cpu_backend_variant(z16 Z16 VXE2 NNPA)
         else()
             message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
         endif()
@@ -402,6 +405,7 @@ ggml_add_backend(Vulkan)
 ggml_add_backend(WebGPU)
 ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
+ggml_add_backend(Hexagon)

 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
```
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt

```diff
@@ -504,11 +504,18 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         endforeach()
     endif()

-    if (GGML_VXE OR
-        message(STATUS "
+    if (GGML_VXE OR GGML_INTERNAL_VXE2)
+        message(STATUS "VXE2 enabled")
         list(APPEND ARCH_FLAGS -mvx -mzvector)
-        list(APPEND ARCH_DEFINITIONS
+        list(APPEND ARCH_DEFINITIONS GGML_USE_VXE2)
     endif()
+
+    if (GGML_INTERNAL_NNPA)
+        message(STATUS "NNPA enabled")
+        list(APPEND ARCH_DEFINITIONS GGML_USE_NNPA)
+    endif()
+
+    ggml_add_cpu_backend_features(${GGML_CPU_NAME} s390 ${ARCH_DEFINITIONS})
 elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
     message(STATUS "Wasm detected")
     list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
```
package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c

```diff
@@ -700,7 +700,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     for (; ib + 1 < nb; ib += 2) {

         // Compute combined scale for the block 0 and 1
-        const
+        const float ft0 = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
+        const __m128 d_0_1 = (__m128)(v4f32){ft0, ft0, ft0, ft0};

         const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);

@@ -714,11 +715,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         bx_1 = __lsx_vsub_b(bx_1, off);
         const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);

-        //_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
-        //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
-
         // Compute combined scale for the block 2 and 3
-        const
+        const float ft1 = GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d);
+        const __m128 d_2_3 = (__m128)(v4f32){ft1, ft1, ft1, ft1};

         const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);

```
package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c

```diff
@@ -580,16 +580,19 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
         uint8_t *patmp = atmp;
         int vsums;
-        int tmp;
+        int tmp, t1, t2, t3, t4, t5, t6, t7;
         __asm__ __volatile__(
             "vsetivli zero, 16, e8, m1\n\t"
             "vmv.v.x v8, zero\n\t"
+            "lb zero, 15(%[sc])\n\t"
             "vle8.v v1, (%[sc])\n\t"
+            "vle8.v v2, (%[bsums])\n\t"
+            "addi %[tmp], %[bsums], 16\n\t"
             "vand.vi v0, v1, 0xF\n\t"
             "vsrl.vi v1, v1, 4\n\t"
+            "vle8.v v3, (%[tmp])\n\t"
             "vse8.v v0, (%[scale])\n\t"
             "vsetivli zero, 16, e16, m2\n\t"
-            "vle16.v v2, (%[bsums])\n\t"
             "vzext.vf2 v0, v1\n\t"
             "vwmul.vv v4, v0, v2\n\t"
             "vsetivli zero, 16, e32, m4\n\t"
@@ -608,46 +611,89 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

         for (int j = 0; j < QK_K/128; ++j) {
             __asm__ __volatile__(
-                "
+                "lb zero, 31(%[q2])\n\t"
+                "addi %[tmp], %[q2], 16\n\t"
+                "addi %[t1], %[q8], 16\n\t"
+                "vsetivli zero, 16, e8, m1\n\t"
                 "vle8.v v0, (%[q2])\n\t"
+                "vle8.v v1, (%[tmp])\n\t"
                 "vsrl.vi v2, v0, 2\n\t"
+                "vsrl.vi v3, v1, 2\n\t"
                 "vsrl.vi v4, v0, 4\n\t"
+                "addi %[tmp], %[q8], 32\n\t"
+                "vle8.v v8, (%[q8])\n\t"
+                "vle8.v v9, (%[t1])\n\t"
+                "addi %[t1], %[t1], 32\n\t"
+                "vsrl.vi v5, v1, 4\n\t"
                 "vsrl.vi v6, v0, 6\n\t"
+                "vsrl.vi v7, v1, 6\n\t"
+                "vle8.v v10, (%[tmp])\n\t"
+                "vle8.v v11, (%[t1])\n\t"
+                "addi %[tmp], %[tmp], 32\n\t"
+                "addi %[t1], %[t1], 32\n\t"
                 "vand.vi v0, v0, 0x3\n\t"
+                "vand.vi v1, v1, 0x3\n\t"
                 "vand.vi v2, v2, 0x3\n\t"
+                "vle8.v v12, (%[tmp])\n\t"
+                "vle8.v v13, (%[t1])\n\t"
+                "addi %[tmp], %[tmp], 32\n\t"
+                "addi %[t1], %[t1], 32\n\t"
+                "vand.vi v3, v3, 0x3\n\t"
                 "vand.vi v4, v4, 0x3\n\t"
-                "
-                "vle8.v
-                "
+                "vand.vi v5, v5, 0x3\n\t"
+                "vle8.v v14, (%[tmp])\n\t"
+                "vle8.v v15, (%[t1])\n\t"
                 "vwmul.vv v16, v0, v8\n\t"
+                "vwmul.vv v18, v1, v9\n\t"
+                "vwmul.vv v20, v2, v10\n\t"
+                "vwmul.vv v22, v3, v11\n\t"
                 "vwmul.vv v24, v4, v12\n\t"
-                "
+                "vwmul.vv v26, v5, v13\n\t"
+                "vwmul.vv v28, v6, v14\n\t"
+                "vwmul.vv v30, v7, v15\n\t"
+                "vsetivli zero, 8, e16, m1\n\t"
                 "vmv.v.x v0, zero\n\t"
-                "
+                "lbu %[tmp], 0(%[scale])\n\t"
+                "vwredsum.vs v8, v16, v0\n\t"
                 "vwredsum.vs v9, v18, v0\n\t"
-                "
-                "vwredsum.vs
-                "vwredsum.vs v11,
-                "
-                "vwredsum.vs
-                "vwredsum.vs
+                "lbu %[t1], 1(%[scale])\n\t"
+                "vwredsum.vs v10, v20, v0\n\t"
+                "vwredsum.vs v11, v22, v0\n\t"
+                "lbu %[t2], 2(%[scale])\n\t"
+                "vwredsum.vs v12, v24, v0\n\t"
+                "vwredsum.vs v13, v26, v0\n\t"
+                "lbu %[t3], 3(%[scale])\n\t"
+                "vwredsum.vs v14, v28, v0\n\t"
+                "vwredsum.vs v15, v30, v0\n\t"
+                "lbu %[t4], 4(%[scale])\n\t"
+                "vwredsum.vs v8, v17, v8\n\t"
+                "vwredsum.vs v9, v19, v9\n\t"
+                "lbu %[t5], 5(%[scale])\n\t"
+                "vwredsum.vs v10, v21, v10\n\t"
+                "vwredsum.vs v11, v23, v11\n\t"
+                "lbu %[t6], 6(%[scale])\n\t"
+                "vwredsum.vs v12, v25, v12\n\t"
+                "vwredsum.vs v13, v27, v13\n\t"
+                "lbu %[t7], 7(%[scale])\n\t"
+                "vwredsum.vs v14, v29, v14\n\t"
+                "vwredsum.vs v15, v31, v15\n\t"
                 "vsetivli zero, 4, e32, m1\n\t"
-                "
-                "
-                "
-                "
-                "
-                "
-                "
-                "
-                "vzext.vf4 v12, v15\n\t"
-                "vmul.vv v10, v10, v12\n\t"
-                "vredsum.vs v0, v10, v0\n\t"
+                "vmul.vx v0, v8, %[tmp]\n\t"
+                "vmul.vx v1, v9, %[t1]\n\t"
+                "vmacc.vx v0, %[t2], v10\n\t"
+                "vmacc.vx v1, %[t3], v11\n\t"
+                "vmacc.vx v0, %[t4], v12\n\t"
+                "vmacc.vx v1, %[t5], v13\n\t"
+                "vmacc.vx v0, %[t6], v14\n\t"
+                "vmacc.vx v1, %[t7], v15\n\t"
                 "vmv.x.s %[tmp], v0\n\t"
-                "
-
+                "vmv.x.s %[t1], v1\n\t"
+                "add %[isum], %[isum], %[tmp]\n\t"
+                "add %[isum], %[isum], %[t1]"
+                : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+                , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
+                , [isum] "+&r" (isum)
                 : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
-                , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
                 : "memory"
                 , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
                 , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
```
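The `lb zero, N(%[reg])` instructions threaded through the rewritten q2_K loop load into the RISC-V `zero` register: the result is discarded, but the access still pulls the line toward the core, so they act as cache-touch hints ahead of the wide vector loads. A rough portable equivalent, assuming a GCC/Clang toolchain (`__builtin_prefetch` is a compiler builtin, not part of this patch):

```cpp
// Touch an address so its cache line is resident before the vector loads use it.
static inline void cache_touch(const void * p) {
    __builtin_prefetch(p, /* rw = read */ 0, /* locality = high */ 3);
}
```

The q3_K hunks below apply the same treatment to the next kernel.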
package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c (continued)

```diff
@@ -929,7 +975,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const int8_t * restrict q8 = y[i].qs;

         int8_t * scale = (int8_t *)utmp;
-        int tmp;
+        int tmp, t1, t2, t3, t4, t5, t6, t7;
         __asm__ __volatile__(
             "vsetivli zero, 12, e8, m1\n\t"
             "vle8.v v0, (%[s6b])\n\t"
@@ -967,19 +1013,23 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         int isum = 0;
         for (int j = 0; j < QK_K; j += 128) {
             __asm__ __volatile__(
+                "lb zero, 31(%[q3])\n\t"
                 "vsetvli zero, %[vl32], e8, m2, ta, mu\n\t"
                 "vle8.v v8, (%[q3])\n\t"
                 "vsrl.vi v10, v8, 2\n\t"
                 "vsrl.vi v12, v8, 4\n\t"
                 "vsrl.vi v14, v8, 6\n\t"
+                "lb zero, 64(%[q8])\n\t"
                 "vand.vi v8, v8, 3\n\t"
                 "vand.vi v10, v10, 3\n\t"
                 "vand.vi v12, v12, 3\n\t"
                 "vle8.v v2, (%[qh])\n\t"
+                "lb zero, 127(%[q8])\n\t"
                 "vand.vx v4, v2, %[m]\n\t"
                 "slli %[m], %[m], 1\n\t"
                 "vmseq.vx v0, v4, zero\n\t"
                 "vadd.vi v8, v8, -4, v0.t\n\t"
+                "lb zero, 0(%[q8])\n\t"
                 "vand.vx v4, v2, %[m]\n\t"
                 "slli %[m], %[m], 1\n\t"
                 "vmseq.vx v0, v4, zero\n\t"
@@ -994,34 +1044,43 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
                 "vadd.vi v14, v14, -4, v0.t\n\t"
                 "vsetvli zero, %[vl128], e8, m8\n\t"
                 "vle8.v v0, (%[q8])\n\t"
+                "lb %[tmp], 0(%[scale])\n\t"
+                "lb %[t1], 1(%[scale])\n\t"
+                "lb %[t2], 2(%[scale])\n\t"
+                "lb %[t3], 3(%[scale])\n\t"
                 "vsetvli zero, %[vl64], e8, m4\n\t"
                 "vwmul.vv v16, v0, v8\n\t"
                 "vwmul.vv v24, v4, v12\n\t"
                 "vsetivli zero, 16, e16, m2\n\t"
                 "vmv.v.x v0, zero\n\t"
-                "vwredsum.vs
+                "vwredsum.vs v8, v16, v0\n\t"
+                "lb %[t4], 4(%[scale])\n\t"
+                "lb %[t5], 5(%[scale])\n\t"
                 "vwredsum.vs v9, v18, v0\n\t"
-                "vwredsum.vs
-                "vwredsum.vs
-                "vwredsum.vs
-                "
-                "
-                "vwredsum.vs
+                "vwredsum.vs v10, v20, v0\n\t"
+                "vwredsum.vs v11, v22, v0\n\t"
+                "vwredsum.vs v12, v24, v0\n\t"
+                "lb %[t6], 6(%[scale])\n\t"
+                "lb %[t7], 7(%[scale])\n\t"
+                "vwredsum.vs v13, v26, v0\n\t"
+                "vwredsum.vs v14, v28, v0\n\t"
+                "vwredsum.vs v15, v30, v0\n\t"
                 "vsetivli zero, 4, e32, m1\n\t"
-                "
-                "
-                "
-                "
-                "
-                "
-                "
-                "
-                "vsext.vf4 v12, v15\n\t"
-                "vmul.vv v10, v10, v12\n\t"
-                "vredsum.vs v0, v10, v0\n\t"
+                "vmul.vx v0, v8, %[tmp]\n\t"
+                "vmul.vx v1, v9, %[t1]\n\t"
+                "vmacc.vx v0, %[t2], v10\n\t"
+                "vmacc.vx v1, %[t3], v11\n\t"
+                "vmacc.vx v0, %[t4], v12\n\t"
+                "vmacc.vx v1, %[t5], v13\n\t"
+                "vmacc.vx v0, %[t6], v14\n\t"
+                "vmacc.vx v1, %[t7], v15\n\t"
                 "vmv.x.s %[tmp], v0\n\t"
-                "
-
+                "vmv.x.s %[t1], v1\n\t"
+                "add %[isum], %[isum], %[tmp]\n\t"
+                "add %[isum], %[isum], %[t1]"
+                : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+                , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
+                , [m] "+&r" (m), [isum] "+&r" (isum)
                 : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
                 , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
                 : "memory"
```
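Both the q2_K and q3_K rewrites end the same way: eight widened group sums sit in `v8`..`v15`, the eight sub-block scales are loaded into scalar registers (`lbu` for q2_K's unsigned scales, `lb` for q3_K's signed ones), and `vmul.vx`/`vmacc.vx` fold everything into two accumulators that are added into `isum`. A scalar sketch of that tail, with illustrative names:

```cpp
#include <cstdint>

// isum += sum over k of scale[k] * group_sum[k], split across two
// accumulators exactly as the vmul.vx / vmacc.vx chain interleaves them.
static int32_t fold_group_sums(const int32_t group_sums[8], const int32_t scales[8]) {
    int32_t acc0 = 0; // ends up in v0
    int32_t acc1 = 0; // ends up in v1
    for (int k = 0; k < 8; k += 2) {
        acc0 += scales[k]     * group_sums[k];
        acc1 += scales[k + 1] * group_sums[k + 1];
    }
    return acc0 + acc1;
}
```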
package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp

```diff
@@ -0,0 +1,50 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__s390x__)
+#include <sys/auxv.h>
+
+// find hwcap bits in asm/elf.h
+#ifndef HWCAP_VXRS_EXT2
+#define HWCAP_VXRS_EXT2 (1 << 15)
+#endif
+
+#ifndef HWCAP_NNPA
+#define HWCAP_NNPA (1 << 20)
+#endif
+
+struct s390x_features {
+    bool has_vxe2 = false;
+    bool has_nnpa = false;
+
+    s390x_features() {
+        uint32_t hwcap = getauxval(AT_HWCAP);
+        // NOTE: use hwcap2 with DFLT for z17 and later
+        // uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+        has_vxe2 = !!(hwcap & HWCAP_VXRS_EXT2);
+        has_nnpa = !!(hwcap & HWCAP_NNPA);
+    }
+};
+
+static int ggml_backend_cpu_s390x_score() {
+    int score = 1;
+    s390x_features sf;
+
+// IBM z15 / LinuxONE 3
+#ifdef GGML_USE_VXE2
+    if (!sf.has_vxe2) { return 0; }
+    score += 1 << 1;
+#endif
+
+// IBM z16 / LinuxONE 4 and z17 / LinuxONE 5
+#ifdef GGML_USE_NNPA
+    if (!sf.has_nnpa) { return 0; }
+    score += 1 << 2;
+#endif
+
+    return score;
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_s390x_score)
+
+#endif // __s390x__
```
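`GGML_BACKEND_DL_SCORE_IMPL` exports this score so the loader can rank the `z15`/`z16` CPU variants registered in the CMake hunks above: 0 means the variant refuses to run on this machine, higher means more features. A sketch of selection logic consistent with that contract (the real registry code is not part of this diff):

```cpp
#include <string>
#include <vector>

struct cpu_variant {
    std::string name;
    int (*score)(); // e.g. ggml_backend_cpu_s390x_score from the new file
};

// Pick the highest-scoring variant; a score of 0 marks a variant unsupported.
static const cpu_variant * pick_variant(const std::vector<cpu_variant> & variants) {
    const cpu_variant * best = nullptr;
    int best_score = 0;
    for (const auto & v : variants) {
        const int s = v.score();
        if (s > best_score) {
            best_score = s;
            best = &v;
        }
    }
    return best;
}
```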
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h

```diff
@@ -500,13 +500,15 @@ inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {

 #endif

-#if defined(
+#if defined(__loongarch_sx)
 /* float type data load instructions */
 static __m128 __lsx_vreplfr2vr_s(const float val) {
     v4f32 res = {val, val, val, val};
     return (__m128)res;
 }
+#endif

+#if defined(__loongarch_asx)
 static __m256 __lasx_xvreplfr2vr_s(const float val) {
     v8f32 res = {val, val, val, val, val, val, val, val};
     return (__m256)res;
```
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c

```diff
@@ -1613,13 +1613,8 @@ static void ggml_compute_forward_mul_mat_id(
         chunk_size = 64;
     }

-#if defined(__aarch64__)
-    // disable for ARM
-    const bool disable_chunking = true;
-#else
     // disable for NUMA
     const bool disable_chunking = ggml_is_numa();
-#endif // defined(__aarch64__)

     int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
     int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
```
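With the `__aarch64__` carve-out gone, chunked `mul_mat_id` scheduling now applies on ARM as well; chunking is only disabled on NUMA systems. The surviving arithmetic in plain form, with `ggml_is_numa()` stubbed for self-containment:

```cpp
#include <cstdint>

static bool is_numa_stub() { return false; } // stand-in for ggml_is_numa()

// Ceil-divided chunk grid over the two result dimensions, as in the kept code.
static int64_t count_chunks(int64_t nr0, int64_t nr1, int64_t chunk_size) {
    const bool disable_chunking = is_numa_stub(); // no ARM special case anymore
    if (disable_chunking) {
        return 1; // sketch: the real code falls back to one range per thread
    }
    const int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; // ceil(nr0 / chunk_size)
    const int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; // ceil(nr1 / chunk_size)
    return nchunk0 * nchunk1;
}
```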