@novastera-oss/llamarn 0.2.7 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +8 -3
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +56 -22
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/common/arg.cpp +7 -0
- package/cpp/llama.cpp/common/common.cpp +3 -0
- package/cpp/llama.cpp/common/common.h +1 -0
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
- package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +8 -3
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
- package/cpp/llama.cpp/src/llama-arch.h +18 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
- package/cpp/llama.cpp/src/llama-batch.h +98 -70
- package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
- package/cpp/llama.cpp/src/llama-context.cpp +101 -107
- package/cpp/llama.cpp/src/llama-context.h +13 -13
- package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
- package/cpp/llama.cpp/src/llama-graph.h +44 -32
- package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-hparams.h +8 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
- package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
- package/cpp/llama.cpp/src/llama-memory.h +18 -22
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
- package/cpp/llama.cpp/src/llama-model.h +22 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
- package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/rn-utils.h +3 -0
- package/ios/include/common.h +1 -0
- package/ios/include/llama.h +8 -3
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
#include "ggml-impl.h"
|
|
7
7
|
#include "ggml-cpu.h"
|
|
8
8
|
#include "ggml-cpu-impl.h"
|
|
9
|
+
#include "simd-mappings.h"
|
|
9
10
|
#include "traits.h"
|
|
10
11
|
|
|
11
12
|
#include <cmath>
|
|
@@ -39,11 +40,11 @@ static inline __m512 __avx512_f32cx8x2_load(ggml_fp16_t *x, ggml_fp16_t *y) {
|
|
|
39
40
|
float tmp[16];
|
|
40
41
|
|
|
41
42
|
for (int i = 0; i < 8; i++) {
|
|
42
|
-
tmp[i] =
|
|
43
|
+
tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
|
|
43
44
|
}
|
|
44
45
|
|
|
45
46
|
for (int i = 0; i < 8; i++) {
|
|
46
|
-
tmp[i + 8] =
|
|
47
|
+
tmp[i + 8] = GGML_CPU_FP16_TO_FP32(y[i]);
|
|
47
48
|
}
|
|
48
49
|
|
|
49
50
|
return _mm512_loadu_ps(tmp);
|
|
@@ -54,10 +55,10 @@ static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) {
|
|
|
54
55
|
_mm_storeu_si128((__m128i*)tmphalf, x);
|
|
55
56
|
|
|
56
57
|
for (int i = 0; i < 4; i++) {
|
|
57
|
-
tmp[i] =
|
|
58
|
-
tmp[i + 4] =
|
|
59
|
-
tmp[i + 8] =
|
|
60
|
-
tmp[i + 12] =
|
|
58
|
+
tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
|
|
59
|
+
tmp[i + 4] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
|
|
60
|
+
tmp[i + 8] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
|
|
61
|
+
tmp[i + 12] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
|
|
61
62
|
}
|
|
62
63
|
|
|
63
64
|
return _mm512_loadu_ps(tmp);
|
|
@@ -67,7 +68,7 @@ static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
|
|
|
67
68
|
float tmp[8];
|
|
68
69
|
|
|
69
70
|
for (int i = 0; i < 8; i++) {
|
|
70
|
-
tmp[i] =
|
|
71
|
+
tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
|
|
71
72
|
}
|
|
72
73
|
|
|
73
74
|
return _mm256_loadu_ps(tmp);
|
|
@@ -76,8 +77,8 @@ static inline __m256 __avx_repeat_f32cx8_load(ggml_fp16_t *x) {
|
|
|
76
77
|
float tmp[8];
|
|
77
78
|
|
|
78
79
|
for (int i = 0; i < 4; i++) {
|
|
79
|
-
tmp[i] =
|
|
80
|
-
tmp[i + 4] =
|
|
80
|
+
tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
|
|
81
|
+
tmp[i + 4] = GGML_CPU_FP16_TO_FP32(x[i]);
|
|
81
82
|
}
|
|
82
83
|
|
|
83
84
|
return _mm256_loadu_ps(tmp);
|
|
@@ -88,7 +89,7 @@ static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrang
|
|
|
88
89
|
|
|
89
90
|
_mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask));
|
|
90
91
|
for (int i = 0; i < 8; i++) {
|
|
91
|
-
tmp[i] =
|
|
92
|
+
tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
|
|
92
93
|
}
|
|
93
94
|
|
|
94
95
|
return _mm256_loadu_ps(tmp);
|
|
@@ -211,7 +212,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
|
|
|
211
212
|
id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f;
|
|
212
213
|
|
|
213
214
|
// Store the scale for the individual block
|
|
214
|
-
y[i].d[row_iter] =
|
|
215
|
+
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
|
|
215
216
|
|
|
216
217
|
// Store the values in blocks of eight values - Aim is to use these later for block interleaving
|
|
217
218
|
srcv[row_iter][0] = v0;
|
|
@@ -297,7 +298,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
|
|
|
297
298
|
const float d = amax / ((1 << 7) - 1);
|
|
298
299
|
id[row_iter] = d ? 1.0f / d : 0.0f;
|
|
299
300
|
|
|
300
|
-
y[i].d[row_iter] =
|
|
301
|
+
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
|
|
301
302
|
}
|
|
302
303
|
|
|
303
304
|
for (int j = 0; j < QK8_0 * 4; j++) {
|
|
@@ -647,7 +648,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
647
648
|
const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);
|
|
648
649
|
|
|
649
650
|
// Load and convert to FP32 scale from block_q8_0
|
|
650
|
-
const __m256 row_scale_f32 = _mm256_set1_ps(
|
|
651
|
+
const __m256 row_scale_f32 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(a_ptr[b].d));
|
|
651
652
|
|
|
652
653
|
// Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector
|
|
653
654
|
__m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs));
|
|
@@ -706,7 +707,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
706
707
|
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
|
707
708
|
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
|
708
709
|
}
|
|
709
|
-
sumf[j] += sumi *
|
|
710
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
|
710
711
|
}
|
|
711
712
|
}
|
|
712
713
|
}
|
|
@@ -972,13 +973,13 @@ void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
972
973
|
sumi2 = sumi2 * scales_1[j];
|
|
973
974
|
sumi += sumi1 + sumi2;
|
|
974
975
|
}
|
|
975
|
-
sumf[j] += sumi *
|
|
976
|
+
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
|
976
977
|
}
|
|
977
978
|
}
|
|
978
979
|
for (int sb = 0; sb < 8; sb++) {
|
|
979
980
|
uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
|
|
980
981
|
for (int j = 0; j < ncols_interleaved; j++) {
|
|
981
|
-
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) *
|
|
982
|
+
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
|
982
983
|
}
|
|
983
984
|
}
|
|
984
985
|
}
|
|
@@ -1755,7 +1756,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
1755
1756
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
|
1756
1757
|
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
|
1757
1758
|
}
|
|
1758
|
-
sumf[m][j] += sumi *
|
|
1759
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
|
1759
1760
|
}
|
|
1760
1761
|
}
|
|
1761
1762
|
}
|
|
@@ -3259,7 +3260,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3259
3260
|
sumi2 = sumi2 * scales_1[j];
|
|
3260
3261
|
sumi += sumi1 + sumi2;
|
|
3261
3262
|
}
|
|
3262
|
-
sumf[m][j] += sumi *
|
|
3263
|
+
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
|
|
3263
3264
|
}
|
|
3264
3265
|
}
|
|
3265
3266
|
}
|
|
@@ -3268,7 +3269,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3268
3269
|
for(int m = 0; m < 4; m++) {
|
|
3269
3270
|
const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
|
3270
3271
|
for(int j = 0; j < ncols_interleaved; j++) {
|
|
3271
|
-
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) *
|
|
3272
|
+
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
|
3272
3273
|
}
|
|
3273
3274
|
}
|
|
3274
3275
|
}
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
#include "traits.h"
|
|
5
5
|
#include "ggml-cpu-impl.h"
|
|
6
6
|
#include "ggml-impl.h"
|
|
7
|
+
#include "simd-mappings.h"
|
|
7
8
|
|
|
8
9
|
#ifdef __cplusplus
|
|
9
10
|
|
|
@@ -12,11 +13,11 @@
|
|
|
12
13
|
// convenience functions/macros for use in template calls
|
|
13
14
|
// note: these won't be required after the 'traits' lookup table is used.
|
|
14
15
|
static inline ggml_fp16_t f32_to_f16(float x) {
|
|
15
|
-
return
|
|
16
|
+
return GGML_CPU_FP32_TO_FP16(x);
|
|
16
17
|
}
|
|
17
18
|
|
|
18
19
|
static inline float f16_to_f32(ggml_fp16_t x) {
|
|
19
|
-
return
|
|
20
|
+
return GGML_CPU_FP16_TO_FP32(x);
|
|
20
21
|
}
|
|
21
22
|
|
|
22
23
|
static inline ggml_bf16_t f32_to_bf16(float x) {
|
|
@@ -62,11 +62,17 @@ struct ggml_compute_params {
|
|
|
62
62
|
#if defined(__s390x__) && defined(__VEC__)
|
|
63
63
|
#ifndef __VXE__
|
|
64
64
|
#define __VXE__
|
|
65
|
-
#endif
|
|
65
|
+
#endif // __VXE__
|
|
66
66
|
#ifndef __VXE2__
|
|
67
67
|
#define __VXE2__
|
|
68
|
-
#endif
|
|
69
|
-
#endif
|
|
68
|
+
#endif // __VXE2__
|
|
69
|
+
#endif // __s390x__ && __VEC__
|
|
70
|
+
|
|
71
|
+
#if defined(__s390x__) && defined(GGML_NNPA)
|
|
72
|
+
#ifndef __NNPA__
|
|
73
|
+
#define __NNPA__
|
|
74
|
+
#endif // __NNPA__
|
|
75
|
+
#endif // __s390x__ && GGML_NNPA
|
|
70
76
|
|
|
71
77
|
#if defined(__ARM_FEATURE_SVE)
|
|
72
78
|
#include <sys/prctl.h>
|
|
@@ -72,15 +72,13 @@
|
|
|
72
72
|
#define UNUSED GGML_UNUSED
|
|
73
73
|
#define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0)
|
|
74
74
|
|
|
75
|
+
// precomputed f32 table for f16 (256 KB) (simd-mappings.h)
|
|
76
|
+
float ggml_table_f32_f16[1 << 16];
|
|
77
|
+
|
|
75
78
|
#if defined(__ARM_ARCH)
|
|
76
79
|
struct ggml_arm_arch_features_type {
|
|
77
|
-
int has_neon;
|
|
78
|
-
int has_dotprod;
|
|
79
|
-
int has_i8mm;
|
|
80
|
-
int has_sve;
|
|
81
80
|
int sve_cnt;
|
|
82
|
-
|
|
83
|
-
} ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
|
|
81
|
+
} ggml_arm_arch_features = { 0 };
|
|
84
82
|
#endif
|
|
85
83
|
|
|
86
84
|
|
|
@@ -197,6 +195,7 @@ typedef pthread_t ggml_thread_t;
|
|
|
197
195
|
|
|
198
196
|
static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
|
199
197
|
[GGML_TYPE_F32] = {
|
|
198
|
+
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp32,
|
|
200
199
|
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
|
|
201
200
|
.vec_dot_type = GGML_TYPE_F32,
|
|
202
201
|
.nrows = 1,
|
|
@@ -678,87 +677,15 @@ bool ggml_is_numa(void) {
|
|
|
678
677
|
|
|
679
678
|
#if defined(__linux__) && defined(__aarch64__)
|
|
680
679
|
#include <sys/auxv.h>
|
|
681
|
-
#elif defined(__APPLE__)
|
|
682
|
-
#include <sys/sysctl.h>
|
|
683
|
-
#endif
|
|
684
|
-
|
|
685
|
-
#if !defined(HWCAP2_I8MM)
|
|
686
|
-
#define HWCAP2_I8MM (1 << 13)
|
|
687
|
-
#endif
|
|
688
|
-
|
|
689
|
-
#if !defined(HWCAP2_SME)
|
|
690
|
-
#define HWCAP2_SME (1 << 23)
|
|
691
680
|
#endif
|
|
692
681
|
|
|
693
682
|
static void ggml_init_arm_arch_features(void) {
|
|
694
|
-
#if defined(__linux__) && defined(__aarch64__)
|
|
695
|
-
uint32_t hwcap = getauxval(AT_HWCAP);
|
|
696
|
-
uint32_t hwcap2 = getauxval(AT_HWCAP2);
|
|
697
|
-
|
|
698
|
-
ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
|
|
699
|
-
ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
|
|
700
|
-
ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
|
|
701
|
-
ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
|
|
702
|
-
ggml_arm_arch_features.has_sme = !!(hwcap2 & HWCAP2_SME);
|
|
703
|
-
|
|
704
|
-
#if defined(__ARM_FEATURE_SVE)
|
|
683
|
+
#if defined(__linux__) && defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
|
|
705
684
|
ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
|
|
706
685
|
#endif
|
|
707
|
-
#elif defined(__APPLE__)
|
|
708
|
-
int oldp = 0;
|
|
709
|
-
size_t size = sizeof(oldp);
|
|
710
|
-
if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
|
|
711
|
-
oldp = 0;
|
|
712
|
-
}
|
|
713
|
-
ggml_arm_arch_features.has_neon = oldp;
|
|
714
|
-
|
|
715
|
-
if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
|
|
716
|
-
oldp = 0;
|
|
717
|
-
}
|
|
718
|
-
ggml_arm_arch_features.has_dotprod = oldp;
|
|
719
|
-
|
|
720
|
-
if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
|
|
721
|
-
oldp = 0;
|
|
722
|
-
}
|
|
723
|
-
ggml_arm_arch_features.has_i8mm = oldp;
|
|
724
|
-
|
|
725
|
-
if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) != 0) {
|
|
726
|
-
oldp = 0;
|
|
727
|
-
}
|
|
728
|
-
ggml_arm_arch_features.has_sme = oldp;
|
|
729
|
-
|
|
730
|
-
ggml_arm_arch_features.has_sve = 0;
|
|
731
|
-
ggml_arm_arch_features.sve_cnt = 0;
|
|
732
|
-
#else
|
|
733
|
-
// Run-time CPU feature detection not implemented for this platform, fallback to compile time
|
|
734
|
-
#if defined(__ARM_NEON)
|
|
735
|
-
ggml_arm_arch_features.has_neon = 1;
|
|
736
|
-
#else
|
|
737
|
-
ggml_arm_arch_features.has_neon = 0;
|
|
738
|
-
#endif
|
|
739
|
-
|
|
740
|
-
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
|
741
|
-
ggml_arm_arch_features.has_i8mm = 1;
|
|
742
|
-
#else
|
|
743
|
-
ggml_arm_arch_features.has_i8mm = 0;
|
|
744
|
-
#endif
|
|
745
|
-
|
|
746
|
-
#if defined(__ARM_FEATURE_SVE)
|
|
747
|
-
ggml_arm_arch_features.has_sve = 1;
|
|
748
|
-
ggml_arm_arch_features.sve_cnt = 16;
|
|
749
|
-
#else
|
|
750
|
-
ggml_arm_arch_features.has_sve = 0;
|
|
751
|
-
ggml_arm_arch_features.sve_cnt = 0;
|
|
752
|
-
#endif
|
|
753
|
-
|
|
754
|
-
#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_SME2)
|
|
755
|
-
ggml_arm_arch_features.has_sme = 1;
|
|
756
|
-
#else
|
|
757
|
-
ggml_arm_arch_features.has_sme = 0;
|
|
758
|
-
#endif
|
|
759
|
-
#endif
|
|
760
686
|
}
|
|
761
|
-
|
|
687
|
+
|
|
688
|
+
#endif // __ARM_ARCH
|
|
762
689
|
|
|
763
690
|
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
|
|
764
691
|
GGML_ASSERT(!ggml_get_no_alloc(ctx));
|
|
@@ -813,7 +740,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
|
|
|
813
740
|
{
|
|
814
741
|
assert(tensor->nb[0] == sizeof(ggml_fp16_t));
|
|
815
742
|
for (int i = 0; i < n; i++) {
|
|
816
|
-
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1),
|
|
743
|
+
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value));
|
|
817
744
|
}
|
|
818
745
|
} break;
|
|
819
746
|
case GGML_TYPE_BF16:
|
|
@@ -872,7 +799,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
|
|
|
872
799
|
{
|
|
873
800
|
assert(tensor->nb[0] == sizeof(ggml_fp16_t));
|
|
874
801
|
for (int i = 0; i < n; i++) {
|
|
875
|
-
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1),
|
|
802
|
+
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value));
|
|
876
803
|
}
|
|
877
804
|
} break;
|
|
878
805
|
case GGML_TYPE_BF16:
|
|
@@ -923,7 +850,7 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
|
|
|
923
850
|
case GGML_TYPE_F16:
|
|
924
851
|
{
|
|
925
852
|
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
|
926
|
-
return
|
|
853
|
+
return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
|
|
927
854
|
}
|
|
928
855
|
case GGML_TYPE_BF16:
|
|
929
856
|
{
|
|
@@ -968,7 +895,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
|
|
|
968
895
|
case GGML_TYPE_F16:
|
|
969
896
|
{
|
|
970
897
|
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
|
971
|
-
((ggml_fp16_t *)(tensor->data))[i] =
|
|
898
|
+
((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value);
|
|
972
899
|
} break;
|
|
973
900
|
case GGML_TYPE_BF16:
|
|
974
901
|
{
|
|
@@ -997,7 +924,7 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i
|
|
|
997
924
|
case GGML_TYPE_I32:
|
|
998
925
|
return ((int32_t *) data)[0];
|
|
999
926
|
case GGML_TYPE_F16:
|
|
1000
|
-
return
|
|
927
|
+
return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
|
|
1001
928
|
case GGML_TYPE_BF16:
|
|
1002
929
|
return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
|
|
1003
930
|
case GGML_TYPE_F32:
|
|
@@ -1024,7 +951,7 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
|
|
|
1024
951
|
} break;
|
|
1025
952
|
case GGML_TYPE_F16:
|
|
1026
953
|
{
|
|
1027
|
-
((ggml_fp16_t *)(data))[0] =
|
|
954
|
+
((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value);
|
|
1028
955
|
} break;
|
|
1029
956
|
case GGML_TYPE_BF16:
|
|
1030
957
|
{
|
|
@@ -1062,7 +989,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
|
|
|
1062
989
|
}
|
|
1063
990
|
case GGML_TYPE_F16:
|
|
1064
991
|
{
|
|
1065
|
-
return
|
|
992
|
+
return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
|
|
1066
993
|
}
|
|
1067
994
|
case GGML_TYPE_BF16:
|
|
1068
995
|
{
|
|
@@ -1101,7 +1028,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
|
|
|
1101
1028
|
} break;
|
|
1102
1029
|
case GGML_TYPE_F16:
|
|
1103
1030
|
{
|
|
1104
|
-
((ggml_fp16_t *)(tensor->data))[i] =
|
|
1031
|
+
((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value);
|
|
1105
1032
|
} break;
|
|
1106
1033
|
case GGML_TYPE_BF16:
|
|
1107
1034
|
{
|
|
@@ -1128,7 +1055,7 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
|
|
|
1128
1055
|
case GGML_TYPE_I32:
|
|
1129
1056
|
return ((int32_t *) data)[0];
|
|
1130
1057
|
case GGML_TYPE_F16:
|
|
1131
|
-
return
|
|
1058
|
+
return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
|
|
1132
1059
|
case GGML_TYPE_BF16:
|
|
1133
1060
|
return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
|
|
1134
1061
|
case GGML_TYPE_F32:
|
|
@@ -1155,7 +1082,7 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
|
|
|
1155
1082
|
} break;
|
|
1156
1083
|
case GGML_TYPE_F16:
|
|
1157
1084
|
{
|
|
1158
|
-
((ggml_fp16_t *)(data))[0] =
|
|
1085
|
+
((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value);
|
|
1159
1086
|
} break;
|
|
1160
1087
|
case GGML_TYPE_BF16:
|
|
1161
1088
|
{
|
|
@@ -1891,6 +1818,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
|
1891
1818
|
{
|
|
1892
1819
|
ggml_compute_forward_get_rows_back(params, tensor);
|
|
1893
1820
|
} break;
|
|
1821
|
+
case GGML_OP_SET_ROWS:
|
|
1822
|
+
{
|
|
1823
|
+
ggml_compute_forward_set_rows(params, tensor);
|
|
1824
|
+
} break;
|
|
1894
1825
|
case GGML_OP_DIAG:
|
|
1895
1826
|
{
|
|
1896
1827
|
ggml_compute_forward_diag(params, tensor);
|
|
@@ -1967,6 +1898,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
|
1967
1898
|
{
|
|
1968
1899
|
ggml_compute_forward_pad_reflect_1d(params, tensor);
|
|
1969
1900
|
} break;
|
|
1901
|
+
case GGML_OP_ROLL:
|
|
1902
|
+
{
|
|
1903
|
+
ggml_compute_forward_roll(params, tensor);
|
|
1904
|
+
} break;
|
|
1970
1905
|
case GGML_OP_ARANGE:
|
|
1971
1906
|
{
|
|
1972
1907
|
ggml_compute_forward_arange(params, tensor);
|
|
@@ -2240,6 +2175,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
|
2240
2175
|
n_tasks = n_threads;
|
|
2241
2176
|
} break;
|
|
2242
2177
|
case GGML_OP_GET_ROWS:
|
|
2178
|
+
case GGML_OP_SET_ROWS:
|
|
2243
2179
|
{
|
|
2244
2180
|
// FIXME: get_rows can use additional threads, but the cost of launching additional threads
|
|
2245
2181
|
// decreases performance with GPU offloading
|
|
@@ -2291,6 +2227,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
|
2291
2227
|
case GGML_OP_UPSCALE:
|
|
2292
2228
|
case GGML_OP_PAD:
|
|
2293
2229
|
case GGML_OP_PAD_REFLECT_1D:
|
|
2230
|
+
case GGML_OP_ROLL:
|
|
2294
2231
|
case GGML_OP_ARANGE:
|
|
2295
2232
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
2296
2233
|
case GGML_OP_ARGSORT:
|
|
@@ -3193,6 +3130,10 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
|
|
|
3193
3130
|
return ggml_graph_compute(cgraph, &cplan);
|
|
3194
3131
|
}
|
|
3195
3132
|
|
|
3133
|
+
void ggml_cpu_fp32_to_fp32(const float * x, float * y, int64_t n) {
|
|
3134
|
+
memcpy(y, x, n * sizeof(float));
|
|
3135
|
+
}
|
|
3136
|
+
|
|
3196
3137
|
void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
|
|
3197
3138
|
int64_t i = 0;
|
|
3198
3139
|
#if defined(__F16C__)
|
|
@@ -3213,9 +3154,24 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
|
|
|
3213
3154
|
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
|
|
3214
3155
|
_mm_storel_epi64((__m128i *)(y + i), y_vec);
|
|
3215
3156
|
}
|
|
3157
|
+
#elif defined(__NNPA__)
|
|
3158
|
+
for (; i + 7 < n; i += 8) {
|
|
3159
|
+
float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
|
|
3160
|
+
float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
|
|
3161
|
+
uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
|
|
3162
|
+
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
|
|
3163
|
+
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
|
|
3164
|
+
}
|
|
3165
|
+
for (; i + 3 < n; i += 4) {
|
|
3166
|
+
float32x4_t v_x = vec_xl(0, (const float *)(x + i));
|
|
3167
|
+
float32x4_t v_zero = vec_splats(0.0f);
|
|
3168
|
+
uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
|
|
3169
|
+
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
|
|
3170
|
+
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
|
|
3171
|
+
}
|
|
3216
3172
|
#endif
|
|
3217
3173
|
for (; i < n; ++i) {
|
|
3218
|
-
y[i] =
|
|
3174
|
+
y[i] = GGML_CPU_FP32_TO_FP16(x[i]);
|
|
3219
3175
|
}
|
|
3220
3176
|
}
|
|
3221
3177
|
|
|
@@ -3239,9 +3195,25 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
|
|
|
3239
3195
|
__m128 y_vec = _mm_cvtph_ps(x_vec);
|
|
3240
3196
|
_mm_storeu_ps(y + i, y_vec);
|
|
3241
3197
|
}
|
|
3198
|
+
#elif defined(__NNPA__)
|
|
3199
|
+
for (; i + 7 < n; i += 8) {
|
|
3200
|
+
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
|
|
3201
|
+
uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
|
|
3202
|
+
float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
|
|
3203
|
+
float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
|
|
3204
|
+
vec_xst(v_yh, 0, (float *)(y + i + 0));
|
|
3205
|
+
vec_xst(v_yl, 0, (float *)(y + i + 4));
|
|
3206
|
+
}
|
|
3207
|
+
for (; i + 3 < n; i += 4) {
|
|
3208
|
+
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
|
|
3209
|
+
uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
|
|
3210
|
+
float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
|
|
3211
|
+
vec_xst(v_yh, 0, (float *)(y + i));
|
|
3212
|
+
}
|
|
3242
3213
|
#endif
|
|
3214
|
+
|
|
3243
3215
|
for (; i < n; ++i) {
|
|
3244
|
-
y[i] =
|
|
3216
|
+
y[i] = GGML_CPU_FP16_TO_FP32(x[i]);
|
|
3245
3217
|
}
|
|
3246
3218
|
}
|
|
3247
3219
|
|
|
@@ -3441,9 +3413,17 @@ int ggml_cpu_has_vxe(void) {
|
|
|
3441
3413
|
#endif
|
|
3442
3414
|
}
|
|
3443
3415
|
|
|
3416
|
+
int ggml_cpu_has_nnpa(void) {
|
|
3417
|
+
#if defined(GGML_NNPA)
|
|
3418
|
+
return 1;
|
|
3419
|
+
#else
|
|
3420
|
+
return 0;
|
|
3421
|
+
#endif
|
|
3422
|
+
}
|
|
3423
|
+
|
|
3444
3424
|
int ggml_cpu_has_neon(void) {
|
|
3445
3425
|
#if defined(__ARM_ARCH) && defined(__ARM_NEON)
|
|
3446
|
-
return
|
|
3426
|
+
return 1;
|
|
3447
3427
|
#else
|
|
3448
3428
|
return 0;
|
|
3449
3429
|
#endif
|
|
@@ -3451,7 +3431,7 @@ int ggml_cpu_has_neon(void) {
|
|
|
3451
3431
|
|
|
3452
3432
|
int ggml_cpu_has_dotprod(void) {
|
|
3453
3433
|
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
|
|
3454
|
-
return
|
|
3434
|
+
return 1;
|
|
3455
3435
|
#else
|
|
3456
3436
|
return 0;
|
|
3457
3437
|
#endif
|
|
@@ -3459,7 +3439,7 @@ int ggml_cpu_has_dotprod(void) {
|
|
|
3459
3439
|
|
|
3460
3440
|
int ggml_cpu_has_sve(void) {
|
|
3461
3441
|
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
|
|
3462
|
-
return
|
|
3442
|
+
return 1;
|
|
3463
3443
|
#else
|
|
3464
3444
|
return 0;
|
|
3465
3445
|
#endif
|
|
@@ -3467,7 +3447,7 @@ int ggml_cpu_has_sve(void) {
|
|
|
3467
3447
|
|
|
3468
3448
|
int ggml_cpu_has_matmul_int8(void) {
|
|
3469
3449
|
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8)
|
|
3470
|
-
return
|
|
3450
|
+
return 1;
|
|
3471
3451
|
#else
|
|
3472
3452
|
return 0;
|
|
3473
3453
|
#endif
|
|
@@ -3483,14 +3463,14 @@ int ggml_cpu_get_sve_cnt(void) {
|
|
|
3483
3463
|
|
|
3484
3464
|
int ggml_cpu_has_sme(void) {
|
|
3485
3465
|
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME)
|
|
3486
|
-
return
|
|
3466
|
+
return 1;
|
|
3487
3467
|
#else
|
|
3488
3468
|
return 0;
|
|
3489
3469
|
#endif
|
|
3490
3470
|
}
|
|
3491
3471
|
|
|
3492
3472
|
void ggml_cpu_init(void) {
|
|
3493
|
-
// needed to initialize
|
|
3473
|
+
// needed to initialize ggml_time
|
|
3494
3474
|
{
|
|
3495
3475
|
struct ggml_init_params params = { 0, NULL, false };
|
|
3496
3476
|
struct ggml_context * ctx = ggml_init(params);
|
|
@@ -3511,9 +3491,10 @@ void ggml_cpu_init(void) {
|
|
|
3511
3491
|
uint16_t u16;
|
|
3512
3492
|
ggml_fp16_t fp16;
|
|
3513
3493
|
} u = {i};
|
|
3514
|
-
float f =
|
|
3515
|
-
|
|
3516
|
-
|
|
3494
|
+
float f = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
|
|
3495
|
+
ggml_table_f32_f16[i] = f;
|
|
3496
|
+
ggml_table_gelu_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_f32(f));
|
|
3497
|
+
ggml_table_gelu_quick_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_quick_f32(f));
|
|
3517
3498
|
}
|
|
3518
3499
|
|
|
3519
3500
|
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
|
@@ -416,6 +416,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
|
|
|
416
416
|
|
|
417
417
|
switch (op->op) {
|
|
418
418
|
case GGML_OP_CPY:
|
|
419
|
+
case GGML_OP_SET_ROWS:
|
|
419
420
|
return
|
|
420
421
|
op->type != GGML_TYPE_IQ3_XXS &&
|
|
421
422
|
op->type != GGML_TYPE_IQ3_S &&
|
|
@@ -578,6 +579,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
|
|
|
578
579
|
if (ggml_cpu_has_vxe()) {
|
|
579
580
|
features.push_back({ "VXE", "1" });
|
|
580
581
|
}
|
|
582
|
+
if (ggml_cpu_has_nnpa()) {
|
|
583
|
+
features.push_back({ "NNPA", "1" });
|
|
584
|
+
}
|
|
581
585
|
if (ggml_cpu_has_wasm_simd()) {
|
|
582
586
|
features.push_back({ "WASM_SIMD", "1" });
|
|
583
587
|
}
|
|
@@ -52,6 +52,7 @@
|
|
|
52
52
|
#include "ggml-impl.h"
|
|
53
53
|
#include "ggml-cpu-impl.h"
|
|
54
54
|
#include "ggml-quants.h"
|
|
55
|
+
#include "simd-mappings.h"
|
|
55
56
|
|
|
56
57
|
#include <array>
|
|
57
58
|
#include <type_traits>
|
|
@@ -73,7 +74,7 @@
|
|
|
73
74
|
namespace {
|
|
74
75
|
|
|
75
76
|
inline float unhalf(ggml_fp16_t d) {
|
|
76
|
-
return
|
|
77
|
+
return GGML_CPU_FP16_TO_FP32(d);
|
|
77
78
|
}
|
|
78
79
|
|
|
79
80
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
@@ -252,7 +253,7 @@ template <> inline float32x4_t load(const ggml_fp16_t * p) {
|
|
|
252
253
|
float tmp[4];
|
|
253
254
|
|
|
254
255
|
for (int i = 0; i < 4; i++) {
|
|
255
|
-
tmp[i] =
|
|
256
|
+
tmp[i] = GGML_CPU_FP16_TO_FP32(p[i]);
|
|
256
257
|
}
|
|
257
258
|
|
|
258
259
|
return vec_xl(0, (const float *)(tmp));
|