@fugood/llama.node 1.4.8 → 1.4.9
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +7 -7
- package/src/LlamaContext.cpp +2 -0
- package/src/llama.cpp/common/arg.cpp +107 -31
- package/src/llama.cpp/common/common.cpp +1 -1
- package/src/llama.cpp/common/common.h +4 -1
- package/src/llama.cpp/common/sampling.cpp +51 -37
- package/src/llama.cpp/common/sampling.h +6 -3
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +283 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +51 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +286 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
- package/src/llama.cpp/src/llama-arch.cpp +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +123 -28
- package/src/llama.cpp/src/llama-mmap.h +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +56 -13
- package/src/llama.cpp/src/llama-model.cpp +7 -5
- package/src/llama.cpp/src/llama-sampling.cpp +16 -0
- package/src/llama.cpp/src/llama.cpp +22 -32

package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp

@@ -786,6 +786,133 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
     ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
 }
 
+void ggml_gemv_q8_0_4x4_q8_0(int n,
+                             float * GGML_RESTRICT s,
+                             size_t bs,
+                             const void * GGML_RESTRICT vx,
+                             const void * GGML_RESTRICT vy,
+                             int nr,
+                             int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx;
+
+    for (int c = 0; c < nc; c += ncols_interleaved) {
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        float32x4_t acc = vdupq_n_f32(0);
+        for (int b = 0; b < nb; b++) {
+            int8x16x4_t b_low = vld1q_s8_x4((const int8_t *) b_ptr->qs);
+            int8x16x4_t b_high = vld1q_s8_x4((const int8_t *) b_ptr->qs + 64);
+            float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
+
+            int8x16x2_t a = vld1q_s8_x2(a_ptr->qs);
+            float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
+
+            int32x4_t ret = vdupq_n_s32(0);
+
+            ret = vdotq_laneq_s32(ret, b_low.val[0], a.val[0], 0);
+            ret = vdotq_laneq_s32(ret, b_low.val[1], a.val[0], 1);
+            ret = vdotq_laneq_s32(ret, b_low.val[2], a.val[0], 2);
+            ret = vdotq_laneq_s32(ret, b_low.val[3], a.val[0], 3);
+
+            ret = vdotq_laneq_s32(ret, b_high.val[0], a.val[1], 0);
+            ret = vdotq_laneq_s32(ret, b_high.val[1], a.val[1], 1);
+            ret = vdotq_laneq_s32(ret, b_high.val[2], a.val[1], 2);
+            ret = vdotq_laneq_s32(ret, b_high.val[3], a.val[1], 3);
+
+            acc = vfmaq_f32(acc, vcvtq_f32_s32(ret), vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
+            a_ptr++;
+            b_ptr++;
+        }
+        vst1q_f32(s, acc);
+        s += ncols_interleaved;
+    }
+    return;
+
+#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    ggml_gemv_q8_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemv_q8_0_4x8_q8_0(int n,
+                             float * GGML_RESTRICT s,
+                             size_t bs,
+                             const void * GGML_RESTRICT vx,
+                             const void * GGML_RESTRICT vy,
+                             int nr,
+                             int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 8;
+
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx;
+
+    for (int c = 0; c < nc; c += ncols_interleaved) {
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        float32x4_t acc = vdupq_n_f32(0);
+
+        for (int b = 0; b < nb; b++) {
+            int8x16x4_t b_low = vld1q_s8_x4((const int8_t *) b_ptr->qs);
+            int8x16x4_t b_high = vld1q_s8_x4((const int8_t *) b_ptr->qs + 64);
+            float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
+
+            int8x8x4_t a_chunks = vld1_s8_x4(a_ptr->qs);
+            int8x16_t a0 = vcombine_s8(a_chunks.val[0], a_chunks.val[0]);
+            int8x16_t a1 = vcombine_s8(a_chunks.val[1], a_chunks.val[1]);
+            int8x16_t a2 = vcombine_s8(a_chunks.val[2], a_chunks.val[2]);
+            int8x16_t a3 = vcombine_s8(a_chunks.val[3], a_chunks.val[3]);
+            float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
+
+            int32x4_t ret0 = vdupq_n_s32(0);
+            int32x4_t ret1 = vdupq_n_s32(0);
+
+            // 0..7
+            ret0 = vdotq_s32(ret0, b_low.val[0], a0);
+            ret1 = vdotq_s32(ret1, b_low.val[1], a0);
+            // 8..15
+            ret0 = vdotq_s32(ret0, b_low.val[2], a1);
+            ret1 = vdotq_s32(ret1, b_low.val[3], a1);
+            // 16..23
+            ret0 = vdotq_s32(ret0, b_high.val[0], a2);
+            ret1 = vdotq_s32(ret1, b_high.val[1], a2);
+            // 24..31
+            ret0 = vdotq_s32(ret0, b_high.val[2], a3);
+            ret1 = vdotq_s32(ret1, b_high.val[3], a3);
+
+            int32x4_t ret = vpaddq_s32(ret0, ret1);
+
+            acc = vfmaq_f32(acc, vcvtq_f32_s32(ret), vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
+            a_ptr++;
+            b_ptr++;
+        }
+        vst1q_f32(s, acc);
+        s += ncols_interleaved;
+    }
+    return;
+
+#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    ggml_gemv_q8_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
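
For reference, here is a minimal scalar sketch of what the 4x4 GEMV path above computes, assuming the interleaved panel layout implied by the NEON loads (each group of 16 packed bytes holds a 4-byte slice for each of the 4 columns, and every column carries its own fp16 scale). The struct and function names below are hypothetical stand-ins, not ggml's block_q8_0/block_q8_0x4 definitions, and scales are kept as plain float for clarity.

/* Illustrative scalar reference, not part of the diff. */
#include <stdint.h>

#define QK8_0 32

typedef struct { float d;    int8_t qs[QK8_0];     } vec_block_q8_0;   /* hypothetical: one 32-element q8_0 block */
typedef struct { float d[4]; int8_t qs[QK8_0 * 4]; } mat_block_q8_0x4; /* hypothetical: 4 interleaved columns     */

/* s[c..c+3] = sum over blocks of (activation scale * column scale) * int8 dot product */
static void gemv_q8_0_4x4_ref(int n, float * s, const mat_block_q8_0x4 * vx, const vec_block_q8_0 * vy, int nc) {
    const int nb = n / QK8_0;
    for (int c = 0; c < nc; c += 4) {
        float sumf[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
        for (int b = 0; b < nb; b++) {
            const mat_block_q8_0x4 * xb = &vx[(c / 4) * nb + b];
            const vec_block_q8_0   * yb = &vy[b];
            int32_t sumi[4] = { 0, 0, 0, 0 };
            for (int k = 0; k < QK8_0; k++) {
                const int group = k / 4;   /* which 4-element slice of the activation */
                const int lane  = k % 4;
                for (int j = 0; j < 4; j++) {
                    /* column j stores its slice for this group at offset group*16 + j*4 */
                    sumi[j] += (int32_t) yb->qs[k] * xb->qs[group * 16 + j * 4 + lane];
                }
            }
            for (int j = 0; j < 4; j++) {
                sumf[j] += yb->d * xb->d[j] * (float) sumi[j];
            }
        }
        for (int j = 0; j < 4; j++) {
            s[c + j] = sumf[j];
        }
    }
}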
@@ -2610,3 +2737,159 @@ void ggml_gemm_q4_K_8x8_q8_K(int n,
 #endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     ggml_gemm_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
 }
+
+
+void ggml_gemm_q8_0_4x4_q8_0(int n,
+                             float * GGML_RESTRICT s,
+                             size_t bs,
+                             const void * GGML_RESTRICT vx,
+                             const void * GGML_RESTRICT vy,
+                             int nr,
+                             int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+
+            float32x4_t sumf[4];
+            for (int m = 0; m < 4; m++) {
+                sumf[m] = vdupq_n_f32(0);
+            }
+
+            for (int l = 0; l < nb; l++) {
+                float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *) a_ptr[l].d));
+                float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *) b_ptr[l].d));
+
+                int32x4_t sumi_0 = vdupq_n_s32(0);
+                int32x4_t sumi_1 = vdupq_n_s32(0);
+                int32x4_t sumi_2 = vdupq_n_s32(0);
+                int32x4_t sumi_3 = vdupq_n_s32(0);
+
+                for (int k_group = 0; k_group < 8; k_group += 4) {
+                    int8x16x4_t a = vld1q_s8_x4(a_ptr[l].qs + 16 * k_group);
+                    int8x16x4_t b = vld1q_s8_x4(b_ptr[l].qs + 16 * k_group);
+
+                    for (int k = 0; k < 4; k++) {
+                        sumi_0 = vdotq_laneq_s32(sumi_0, b.val[k], a.val[k], 0);
+                        sumi_1 = vdotq_laneq_s32(sumi_1, b.val[k], a.val[k], 1);
+                        sumi_2 = vdotq_laneq_s32(sumi_2, b.val[k], a.val[k], 2);
+                        sumi_3 = vdotq_laneq_s32(sumi_3, b.val[k], a.val[k], 3);
+                    }
+                }
+
+                sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
+                sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
+                sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
+                sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
+            }
+
+            for (int m = 0; m < 4; m++) {
+                vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
+            }
+        }
+    }
+    return;
+#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    ggml_gemm_q8_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemm_q8_0_4x8_q8_0(int n,
+                             float * GGML_RESTRICT s,
+                             size_t bs,
+                             const void * GGML_RESTRICT vx,
+                             const void * GGML_RESTRICT vy,
+                             int nr,
+                             int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 8;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+    const block_q8_0x4 * b_ptr_base = (const block_q8_0x4 *) vx;
+
+    for (int y = 0; y < nr; y += 4) {
+        const block_q8_0x4 * a_ptr_base = (const block_q8_0x4 *) vy + (y / 4) * nb;
+
+        for (int x = 0; x < nc; x += ncols_interleaved) {
+            const block_q8_0x4 * b_ptr = b_ptr_base + (x / 4) * nb;
+            const block_q8_0x4 * a_ptr = a_ptr_base;
+
+            float32x4_t acc_f32[4];
+            for (int i = 0; i < 4; i++) {
+                acc_f32[i] = vdupq_n_f32(0);
+            }
+
+            for (int b = 0; b < nb; b++) {
+                int32x4_t acc[4];
+                for (int i = 0; i < 4; i++) {
+                    acc[i] = vdupq_n_s32(0);
+                }
+
+                // Process 4 chunks of 8 positions each
+                for (int chunk = 0; chunk < 4; chunk++) {
+                    int8x16_t a01 = vld1q_s8(a_ptr->qs + chunk * 32);
+                    int8x16_t a23 = vld1q_s8(a_ptr->qs + chunk * 32 + 16);
+                    int8x16_t b01 = vld1q_s8(b_ptr->qs + chunk * 32);
+                    int8x16_t b23 = vld1q_s8(b_ptr->qs + chunk * 32 + 16);
+
+                    acc[0] = vmmlaq_s32(acc[0], a01, b01);
+                    acc[1] = vmmlaq_s32(acc[1], a01, b23);
+                    acc[2] = vmmlaq_s32(acc[2], a23, b01);
+                    acc[3] = vmmlaq_s32(acc[3], a23, b23);
+                }
+
+                // Reorder outputs from 2×2 tiles to row-major
+                // acc[0] = [r0c0, r0c1, r1c0, r1c1]
+                // acc[1] = [r0c2, r0c3, r1c2, r1c3]
+                // acc[2] = [r2c0, r2c1, r3c0, r3c1]
+                // acc[3] = [r2c2, r2c3, r3c2, r3c3]
+                int32x4_t row0 = vcombine_s32(vget_low_s32(acc[0]), vget_low_s32(acc[1]));
+                int32x4_t row1 = vcombine_s32(vget_high_s32(acc[0]), vget_high_s32(acc[1]));
+                int32x4_t row2 = vcombine_s32(vget_low_s32(acc[2]), vget_low_s32(acc[3]));
+                int32x4_t row3 = vcombine_s32(vget_high_s32(acc[2]), vget_high_s32(acc[3]));
+
+                // Scales
+                float32x4_t a_d = vcvt_f32_f16(vld1_f16((const __fp16 *) a_ptr->d));
+                float32x4_t b_d = vcvt_f32_f16(vld1_f16((const __fp16 *) b_ptr->d));
+
+                acc_f32[0] = vfmaq_f32(acc_f32[0], vcvtq_f32_s32(row0), vmulq_laneq_f32(b_d, a_d, 0));
+                acc_f32[1] = vfmaq_f32(acc_f32[1], vcvtq_f32_s32(row1), vmulq_laneq_f32(b_d, a_d, 1));
+                acc_f32[2] = vfmaq_f32(acc_f32[2], vcvtq_f32_s32(row2), vmulq_laneq_f32(b_d, a_d, 2));
+                acc_f32[3] = vfmaq_f32(acc_f32[3], vcvtq_f32_s32(row3), vmulq_laneq_f32(b_d, a_d, 3));
+
+                a_ptr++;
+                b_ptr++;
+            }
+
+            for (int row = 0; row < 4; row++) {
+                vst1q_f32(s + (y + row) * bs + x, acc_f32[row]);
+            }
+        }
+    }
+    return;
+#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+    ggml_gemm_q8_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
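
The 4x8 GEMM path above relies on the i8mm SMMLA instruction, which produces 2x2 int32 tiles rather than rows; the vget/vcombine sequence stitches four such tiles back into a row-major 4x4 block, as the inline comments describe. Below is a hedged scalar sketch of that bookkeeping for a single 8-element chunk; the names are illustrative, not the ggml code.

/* Scalar sketch of the i8mm tile reordering, not part of the diff. */
#include <stdint.h>

/* rows: 4x8 slice of A (row-major); cols: 4x8 slice of B, one column's 8 values per entry. */
static void mmla_4x4_tiles_ref(const int8_t rows[4][8], const int8_t cols[4][8], int32_t out[4][4]) {
    int32_t tile[4][2][2]; /* tile[0]: rows 0-1 x cols 0-1, tile[1]: rows 0-1 x cols 2-3,
                              tile[2]: rows 2-3 x cols 0-1, tile[3]: rows 2-3 x cols 2-3 */
    for (int t = 0; t < 4; t++) {
        const int r0 = (t / 2) * 2;
        const int c0 = (t % 2) * 2;
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 2; j++) {
                int32_t acc = 0;
                for (int k = 0; k < 8; k++) {
                    acc += (int32_t) rows[r0 + i][k] * cols[c0 + j][k];
                }
                tile[t][i][j] = acc; /* what one vmmlaq_s32 lane would hold */
            }
        }
    }
    /* Reorder: row r takes its left pair from tile {0,2} and its right pair from
       tile {1,3}, mirroring the vcombine_s32/vget_{low,high}_s32 shuffle above. */
    for (int r = 0; r < 4; r++) {
        const int upper = (r < 2) ? 0 : 2;
        out[r][0] = tile[upper][r % 2][0];
        out[r][1] = tile[upper][r % 2][1];
        out[r][2] = tile[upper + 1][r % 2][0];
        out[r][3] = tile[upper + 1][r % 2][1];
    }
}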
package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h

@@ -43,6 +43,8 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
+#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
@@ -51,6 +53,8 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
+#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
 // repack.cpp
 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
@@ -67,10 +71,14 @@
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
+#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
+#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
+#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
+#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__POWERPC__) || defined(__powerpc__)
 // ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
 // quants.c
@@ -91,6 +99,8 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
+#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
@@ -99,6 +109,8 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
+#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__loongarch64)
 // quants.c
 #define quantize_row_q8_K_generic quantize_row_q8_K
@@ -119,6 +131,8 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
+#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
@@ -127,6 +141,8 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
+#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__riscv)
 // quants.c
 #define quantize_row_q8_K_generic quantize_row_q8_K
@@ -154,6 +170,8 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
+#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
@@ -161,6 +179,8 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
+#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__s390x__)
 // quants.c
 #define quantize_row_q8_K_generic quantize_row_q8_K
@@ -187,6 +207,8 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
+#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
@@ -195,6 +217,8 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
+#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__wasm__)
 // quants.c
 #define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
@@ -223,6 +247,8 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
+#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
@@ -231,4 +257,6 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
+#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #endif
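
Each added line in arch-fallback.h follows the header's existing pattern: `#define <kernel>_generic <kernel>` token-aliases the portable `_generic` implementation to the public kernel name inside that target's preprocessor branch, so callers always resolve to a single symbol. A minimal sketch of the aliasing idea, with made-up names rather than ggml's build setup:

/* Illustrative only; the real macros live in ggml's arch-fallback.h. */

/* No tuned scale_f32 on this target, so build the portable body directly
 * under the public name. */
#if !defined(HAVE_TUNED_SCALE_F32)
#define scale_f32_generic scale_f32
#endif

/* Portable implementation; compiled as scale_f32 when the alias is active. */
void scale_f32_generic(const float * x, float * y, int n) {
    for (int i = 0; i < n; i++) {
        y[i] = 2.0f * x[i];
    }
}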
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c

@@ -3320,13 +3320,33 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
-
-
-
-
-
-
+
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfhmin)
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m2();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
+
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16*)x + i + epr, epr);
+        vfloat32m4_t ay1 = __riscv_vfwcvt_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
     }
+
+    // leftovers
+    int vl;
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, vl);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
+    }
+
 #endif
 
     for (; i < n; ++i) {
@@ -3371,6 +3391,31 @@ void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
                                 (const __m128i *)(x + i))),
                 16)));
     }
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfmin)
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m2();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
+
+        vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16*)x + i + epr, epr);
+        vfloat32m4_t ay1 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
+    }
+
+    // leftovers
+    int vl;
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, vl);
+        vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
+    }
 #endif
     for (; i < n; i++) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
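
The new RVV fp16→fp32 and bf16→fp32 paths use a classic strip-mining shape: a main loop over `np = n & ~(step - 1)` elements (two maximum-VL vectors per iteration) and a variable-length tail for the rest. A small scalar sketch of just that blocking arithmetic, under the assumption that `step` is a power of two (which holds here because RVV's VLEN is a power of two):

/* Illustrative blocking sketch, not the ggml code. */
static void copy_f32_blocked(const float * x, float * y, int n) {
    const int step = 8;               /* stand-in for 2 * __riscv_vsetvlmax_e16m2() */
    const int np   = n & ~(step - 1); /* n rounded down to a multiple of step       */

    int i = 0;
    for (; i < np; i += step) {       /* main loop: full blocks, unrolled by 2 in the real code */
        for (int j = 0; j < step; j++) {
            y[i + j] = x[i + j];
        }
    }
    for (; i < n; i++) {              /* leftovers: fewer than step elements remain */
        y[i] = x[i];
    }
}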