@novastera-oss/llamarn 0.2.7 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +8 -3
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +56 -22
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/common/arg.cpp +7 -0
- package/cpp/llama.cpp/common/common.cpp +3 -0
- package/cpp/llama.cpp/common/common.h +1 -0
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
- package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +8 -3
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
- package/cpp/llama.cpp/src/llama-arch.h +18 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
- package/cpp/llama.cpp/src/llama-batch.h +98 -70
- package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
- package/cpp/llama.cpp/src/llama-context.cpp +101 -107
- package/cpp/llama.cpp/src/llama-context.h +13 -13
- package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
- package/cpp/llama.cpp/src/llama-graph.h +44 -32
- package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-hparams.h +8 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
- package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
- package/cpp/llama.cpp/src/llama-memory.h +18 -22
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
- package/cpp/llama.cpp/src/llama-model.h +22 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
- package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/rn-utils.h +3 -0
- package/ios/include/common.h +1 -0
- package/ios/include/llama.h +8 -3
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c:

```diff
@@ -2,6 +2,7 @@
 #include "ggml-common.h"
 
 #include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
 #include "ggml-quants.h"
 #include "quants.h"
 
@@ -137,7 +138,7 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
         }
 
         int sumi = sumi0 + sumi1;
-        sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
+        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
     }
 
     *s = sumf;
@@ -174,7 +175,7 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
         }
 
         int sumi = sumi0 + sumi1;
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
     }
 
     *s = sumf;
@@ -217,7 +218,7 @@ void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
         }
 
         int sumi = sumi0 + sumi1;
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
     }
 
     *s = sumf;
@@ -260,7 +261,7 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
         }
 
         int sumi = sumi0 + sumi1;
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
     }
 
     *s = sumf;
@@ -290,7 +291,7 @@ void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
             sumi += x[ib].qs[j]*y[ib].qs[j];
         }
 
-        sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
+        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
     }
 
     *s = sumf;
@@ -342,7 +343,7 @@ void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
             }
         }
 
-        sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
+        sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
     }
 
     *s = sumf;
@@ -372,7 +373,7 @@ void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
             }
         }
 
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
 
         sumf += (float) sumi * d;
     }
@@ -405,8 +406,8 @@ void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
             summs += y[i].bsums[j] * (sc[j] >> 4);
         }
 
-        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
 
         int isum = 0;
         int is = 0;
@@ -504,7 +505,7 @@ void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
             for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -577,9 +578,9 @@ void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
             for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
         sumf -= dmin * sumi;
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -657,9 +658,9 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
             for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
         sumf -= dmin * sumi;
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -714,7 +715,7 @@ void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
             for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -739,7 +740,7 @@ void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
 
     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint16_t * GGML_RESTRICT q2 = x[i].qs;
         const int8_t * GGML_RESTRICT q8 = y[i].qs;
         int32_t bsum = 0;
@@ -778,7 +779,7 @@ void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
 
     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint16_t * GGML_RESTRICT q2 = x[i].qs;
         const uint8_t * GGML_RESTRICT sc = x[i].scales;
         const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -829,7 +830,7 @@ void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
     float sumf = 0;
     for (int i = 0; i < nb; i++) {
 
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const int8_t * q8 = y[i].qs;
         const uint8_t * qs = x[i].qs;
         const uint8_t * qh = x[i].qh;
@@ -882,7 +883,7 @@ void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
 
     float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint8_t * GGML_RESTRICT q3 = x[i].qs;
         const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
         const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -924,7 +925,7 @@ void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
 
     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint8_t * GGML_RESTRICT qs = x[i].qs;
         const uint8_t * GGML_RESTRICT qh = x[i].qh;
         const uint8_t * GGML_RESTRICT signs = x[i].signs;
@@ -1002,7 +1003,7 @@ void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
             qs += 4;
         }
 
-        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
+        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
     }
 
     *s = sumf;
@@ -1063,7 +1064,7 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
             qh += 2;
         }
 
-        sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
+        sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
     }
 
     *s = sumf;
@@ -1087,7 +1088,7 @@ void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
     float sumf = 0;
 
     for (; ib < nb; ++ib) {
-        const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
+        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
         int sumi1 = 0, sumi2 = 0;
         for (int j = 0; j < QK4_NL/2; ++j) {
             sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@@ -1113,7 +1114,7 @@ void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
 
     float sumf = 0;
     for (int ibl = 0; ibl < nb; ++ibl) {
-        const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
         uint16_t h = x[ibl].scales_h;
         const uint8_t * qs = x[ibl].qs;
         const int8_t * q8 = y[ibl].qs;
```
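Every quants.c hunk above is the same mechanical change: the generic vector-dot kernels now decode their fp16 block scales through the CPU backend's own macros (GGML_CPU_FP16_TO_FP32, provided by the newly included simd-mappings.h) instead of the previous ggml-wide GGML_FP16_TO_FP32. A minimal standalone sketch of the pattern these kernels share, with a portable bit-twiddling function standing in for the macro; the block layout and names below (fp16_to_fp32, block_q8, vec_dot_q8_q8) are simplified stand-ins, not ggml's definitions:

```cpp
#include <cstdint>
#include <cstring>

// Stand-in for GGML_CPU_FP16_TO_FP32: decode an IEEE-754 binary16 stored
// as uint16_t into a float. The real macro can lower to a single hardware
// conversion via simd-mappings.h where the target supports it.
static float fp16_to_fp32(uint16_t h) {
    const uint32_t sign = (uint32_t)(h >> 15) << 31;
    const uint32_t exp  = (h >> 10) & 0x1F;
    const uint32_t mant = h & 0x3FF;
    if (exp == 0) {
        // zero / subnormal: value is mant * 2^-24
        return (sign ? -1.0f : 1.0f) * (float)mant * 5.9604645e-08f;
    }
    uint32_t bits;
    if (exp == 31) {
        bits = sign | 0x7F800000u | (mant << 13);         // inf / NaN
    } else {
        bits = sign | ((exp + 112) << 23) | (mant << 13); // rebias 15 -> 127
    }
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// Simplified q8_0-style block: 32 int8 weights sharing one fp16 scale.
struct block_q8 {
    uint16_t d;      // scale, fp16
    int8_t   qs[32]; // quantized values
};

// The shape of every hunk above: an integer dot product per block,
// then a single float multiply by the two decoded scales.
float vec_dot_q8_q8(int nblocks, const block_q8 * x, const block_q8 * y) {
    float sumf = 0.0f;
    for (int ib = 0; ib < nblocks; ++ib) {
        int sumi = 0;
        for (int j = 0; j < 32; ++j) {
            sumi += x[ib].qs[j] * y[ib].qs[j];
        }
        sumf += sumi * (fp16_to_fp32(x[ib].d) * fp16_to_fp32(y[ib].d));
    }
    return sumf;
}
```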
package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp:

```diff
@@ -6,6 +6,7 @@
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
 #include "traits.h"
 
 #include "arch-fallback.h"
@@ -72,7 +73,7 @@ void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GG
             const float d = amax / ((1 << 7) - 1);
             id[row_iter] = d ? 1.0f / d : 0.0f;
 
-            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+            y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
         }
 
         for (int j = 0; j < QK8_0 * 4; j++) {
@@ -110,7 +111,7 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
             const float d = amax / ((1 << 7) - 1);
             id[row_iter] = d ? 1.0f / d : 0.0f;
 
-            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+            y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
         }
 
         for (int j = 0; j < QK8_0 * 4; j++) {
@@ -236,7 +237,7 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
                     const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                     sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                 }
-                sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
             }
         }
     }
@@ -280,7 +281,7 @@ void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
                     const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                     sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                 }
-                sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
             }
         }
     }
@@ -325,7 +326,7 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
                     const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                     sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                 }
-                sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
             }
         }
     }
@@ -396,13 +397,13 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
                     sumi2 = sumi2 * scales_1[j];
                     sumi += sumi1 + sumi2;
                 }
-                sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
+                sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
             }
         }
         for (int sb = 0; sb < 8; sb++) {
             uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
             for (int j = 0; j < ncols_interleaved; j++) {
-                sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
+                sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
             }
         }
     }
@@ -449,7 +450,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
                     const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
                     sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
                 }
-                sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
             }
         }
     }
@@ -500,7 +501,7 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
                             sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                      (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                         }
-                        sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                        sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                     }
                 }
             }
@@ -555,7 +556,7 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
                             sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                      (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                         }
-                        sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                        sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                     }
                 }
             }
@@ -609,7 +610,7 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
                             sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                      (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                         }
-                        sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                        sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                     }
                 }
             }
@@ -688,7 +689,7 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
                         sumi2 = sumi2 * scales_1[j];
                         sumi += sumi1 + sumi2;
                     }
-                    sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
+                    sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
                 }
             }
         }
@@ -697,7 +698,7 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
             for(int m = 0; m < 4; m++) {
                 const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
                 for(int j = 0; j < ncols_interleaved; j++) {
-                    sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
+                    sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
                 }
             }
         }
@@ -753,7 +754,7 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
                             sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                      (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
                         }
-                        sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                        sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                     }
                 }
            }
```
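The repack.cpp hunks above apply the same macro rename to the interleaved GEMV/GEMM kernels. The context lines also show the nibble trick these kernels rely on: casting `(int8_t)(byte << 4)` and `(int8_t)(byte & 0xF0)` sign-extends each 4-bit weight while scaling it by 16, and a single `>> 4` after the multiply-accumulate undoes that scaling for both nibbles at once; this only works if the repacked blocks store the nibbles in two's complement, which is what the shown arithmetic requires. A self-contained sketch of just that trick (dot_q4_byte and its arguments are illustrative names, not ggml's):

```cpp
#include <cstdint>

// One byte packs two 4-bit weights as two's-complement nibbles.
// (int8_t)(byte << 4)  sign-extends the LOW  nibble, scaled by 16;
// (int8_t)(byte & 0xF0) sign-extends the HIGH nibble, scaled by 16.
static int dot_q4_byte(uint8_t packed, int8_t a_lo, int8_t a_hi) {
    const int v0 = (int8_t)(packed << 4);   // low  nibble * 16
    const int v1 = (int8_t)(packed & 0xF0); // high nibble * 16
    // One arithmetic shift undoes the *16 scaling of both products.
    return ((v0 * a_lo) + (v1 * a_hi)) >> 4;
}

int main() {
    // 0x9C: low nibble 0xC = -4, high nibble 0x9 = -7 (two's complement).
    // Expected: (-4)*3 + (-7)*2 = -26.
    return dot_q4_byte(0x9C, 3, 2) == -26 ? 0 : 1;
}
```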
```diff
@@ -1163,13 +1164,24 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
     // not realy a GGML_TYPE_Q8_0 but same size.
     switch (op->op) {
         case GGML_OP_MUL_MAT:
-            size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
-            return true;
+            {
+                size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
+                return true;
+            }
         case GGML_OP_MUL_MAT_ID:
-            size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
-            size = GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc.
-
-            return true;
+            {
+                size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
+                size = GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc.
+
+                const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
+                const int64_t ne12 = op->src[1]->ne[2]; // n_tokens
+
+                const size_t sizeof_mmid_row_mapping = sizeof(int64_t);
+
+                size += sizeof_mmid_row_mapping*ne02*(ne12 + 1);
+
+                return true;
+            }
         default:
             // GGML_ABORT("fatal error");
             break;
```
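This hunk replaces the old MUL_MAT_ID work-size estimate with an explicit layout computation: src1 converted to the parameter type comes first, then padding up to int64_t alignment, then ne02*(ne12 + 1) eight-byte slots covering both the per-expert row counts and the [n_as][ne12] row-mapping table that the next hunk carves out. A hedged sketch of the same arithmetic outside ggml; row_bytes stands in for ggml_row_size(PARAM_TYPE, ggml_nelements(src1)), and mul_mat_id_wsize/pad_to are illustrative names:

```cpp
#include <cstddef>
#include <cstdint>

// Round n up to a multiple of align; same effect as GGML_PAD.
static size_t pad_to(size_t n, size_t align) {
    return (n + align - 1) / align * align;
}

// Mirrors the scratch layout the diff describes for GGML_OP_MUL_MAT_ID:
// [ src1 quantized to PARAM_TYPE | pad to 8 | counts[n_as] | rows[n_as][ne12] ].
size_t mul_mat_id_wsize(size_t row_bytes, int64_t n_as /*experts*/, int64_t ne12 /*tokens*/) {
    size_t size = pad_to(row_bytes, sizeof(int64_t)); // padding for the next block
    // One mmid_row_mapping (two int32_t, i.e. one int64_t) per token per
    // expert, plus one extra int64_t per expert for the row counts:
    const size_t sizeof_mmid_row_mapping = sizeof(int64_t);
    size += sizeof_mmid_row_mapping * n_as * (ne12 + 1);
    return size;
}

// e.g. row_bytes = 4096, n_as = 8 experts, ne12 = 512 tokens:
// 4096 + 8*(512 + 1)*8 = 4096 + 32832 = 36928 bytes.
```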
```diff
@@ -1305,14 +1317,17 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
             int32_t i2;
         };
 
-        GGML_ASSERT(params->wsize >=
-
+        GGML_ASSERT(params->wsize >=
+                   (GGML_PAD(nbw3, sizeof(int64_t)) +
+                    n_as*(ne12 + 1)*sizeof(mmid_row_mapping))
+        );
 
-        auto * wdata
-        auto * wdata_src1_end
-        auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+        auto * wdata = (char *)params->wdata;
+        auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));
 
-
+        // total of [n_as][ne12 + 1] elemets of type mmid_row_mapping (2*int32_t = int64_t)
+        auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+        struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
 
         // src1: float32 => param type
         for (int64_t i12 = 0; i12 < ne12; ++i12) {
```
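The assertion and pointer arithmetic here only line up with the size computation because mmid_row_mapping is two int32_t fields, exactly one int64_t per entry, which is what lets the single (ne12 + 1) term cover both the counts array and the mapping table. A small sketch of that invariant and the carving order (an illustration of the layout under the assumptions above, not ggml's code; carve and its out-parameters are hypothetical names):

```cpp
#include <cstdint>
#include <cstring>

struct mmid_row_mapping {
    int32_t i1;
    int32_t i2;
};
static_assert(sizeof(mmid_row_mapping) == sizeof(int64_t),
              "the (ne12 + 1) accounting counts one int64 per mapping");

// Carve the scratch buffer the way the diff does: quantized src1 first,
// padded to int64_t alignment, then per-expert counts, then the mapping table.
void carve(void * wdata_raw, size_t nbw3, int64_t n_as,
           int64_t ** counts_out, mmid_row_mapping ** rows_out) {
    auto * wdata          = (char *) wdata_raw;
    auto * wdata_src1_end = wdata +
        (nbw3 + sizeof(int64_t) - 1) / sizeof(int64_t) * sizeof(int64_t);

    auto * matrix_row_counts = (int64_t *) wdata_src1_end;                    // [n_as]
    auto * matrix_rows = (mmid_row_mapping *) (matrix_row_counts + n_as);     // [n_as][ne12]

    std::memset(matrix_row_counts, 0, n_as * sizeof(int64_t)); // counts start at zero
    *counts_out = matrix_row_counts;
    *rows_out   = matrix_rows;
}
```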
```diff
@@ -1397,44 +1412,45 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
     }
 };
 
-// instance for Q4
-static const tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
-static const tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
-static const tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
-static const tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
-
-// instance for IQ4
-static const tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
-
 } // namespace ggml::cpu::repack
 
 static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {
+
+    // instance for Q4
+    static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
+    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
+    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
+    static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
+
+    // instance for IQ4
+    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
+
     if (cur->type == GGML_TYPE_Q4_0) {
         if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
             if (cur->ne[1] % 8 == 0) {
-                return &ggml::cpu::repack::q4_0_8x8_q8_0;
+                return &q4_0_8x8_q8_0;
             }
         }
         if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
             if (cur->ne[1] % 4 == 0) {
-                return &ggml::cpu::repack::q4_0_4x8_q8_0;
+                return &q4_0_4x8_q8_0;
             }
         }
         if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
             if (cur->ne[1] % 4 == 0) {
-                return &ggml::cpu::repack::q4_0_4x4_q8_0;
+                return &q4_0_4x4_q8_0;
             }
         }
     } else if (cur->type == GGML_TYPE_Q4_K) {
         if (ggml_cpu_has_avx2()) {
             if (cur->ne[1] % 8 == 0) {
-                return &ggml::cpu::repack::q4_K_8x8_q8_K;
+                return &q4_K_8x8_q8_K;
             }
         }
     } else if (cur->type == GGML_TYPE_IQ4_NL) {
         if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
             if (cur->ne[1] % 4 == 0) {
-                return &ggml::cpu::repack::iq4_nl_4x4_q8_0;
+                return &iq4_nl_4x4_q8_0;
             }
         }
     }
```
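The final hunk moves the repack tensor_traits singletons from namespace scope into ggml_repack_get_optimal_repack_type, qualifying their types and dropping the namespace qualifier from the returns. The diff itself doesn't state the motivation, but the shape it lands on is the standard function-local-static pattern: namespace-scope statics in different translation units are constructed in an unspecified order, whereas a function-local static is constructed on first use, and thread-safely since C++11. A minimal sketch of the pattern with placeholder types (fake_traits and optimal_traits are illustrative, not ggml's names):

```cpp
// Placeholder standing in for ggml::cpu::repack::tensor_traits<...>.
struct fake_traits {
    int id;
};

const fake_traits * optimal_traits(bool want_8x8) {
    // Constructed on first call, in a well-defined order, instead of at
    // namespace scope where cross-TU initialization order is unspecified.
    static const fake_traits traits_4x4 { 44 };
    static const fake_traits traits_8x8 { 88 };

    return want_8x8 ? &traits_8x8 : &traits_4x4;
}
```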