@fugood/llama.node 1.4.8 → 1.4.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/lib/binding.ts +43 -0
  2. package/lib/parallel.js +26 -0
  3. package/lib/parallel.ts +33 -0
  4. package/package.json +15 -15
  5. package/scripts/llama.cpp.patch +12 -14
  6. package/src/LlamaCompletionWorker.cpp +3 -1
  7. package/src/LlamaCompletionWorker.h +2 -0
  8. package/src/LlamaContext.cpp +16 -1
  9. package/src/LlamaContext.h +3 -0
  10. package/src/llama.cpp/common/CMakeLists.txt +4 -4
  11. package/src/llama.cpp/common/arg.cpp +159 -42
  12. package/src/llama.cpp/common/arg.h +10 -1
  13. package/src/llama.cpp/common/common.cpp +1 -1
  14. package/src/llama.cpp/common/common.h +6 -2
  15. package/src/llama.cpp/common/preset.cpp +197 -5
  16. package/src/llama.cpp/common/preset.h +45 -3
  17. package/src/llama.cpp/common/sampling.cpp +51 -37
  18. package/src/llama.cpp/common/sampling.h +6 -3
  19. package/src/llama.cpp/common/speculative.cpp +1 -1
  20. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +283 -0
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +51 -6
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +286 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
  28. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
  29. package/src/llama.cpp/src/llama-arch.cpp +1 -1
  30. package/src/llama.cpp/src/llama-mmap.cpp +123 -28
  31. package/src/llama.cpp/src/llama-mmap.h +5 -1
  32. package/src/llama.cpp/src/llama-model-loader.cpp +56 -13
  33. package/src/llama.cpp/src/llama-model.cpp +7 -5
  34. package/src/llama.cpp/src/llama-sampling.cpp +16 -0
  35. package/src/llama.cpp/src/llama.cpp +22 -32
@@ -786,6 +786,133 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
     ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
 }
 
+void ggml_gemv_q8_0_4x4_q8_0(int n,
+                             float * GGML_RESTRICT s,
+                             size_t bs,
+                             const void * GGML_RESTRICT vx,
+                             const void * GGML_RESTRICT vy,
+                             int nr,
+                             int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx;
+
+    for (int c = 0; c < nc; c += ncols_interleaved) {
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        float32x4_t acc = vdupq_n_f32(0);
+        for (int b = 0; b < nb; b++) {
+            int8x16x4_t b_low  = vld1q_s8_x4((const int8_t *) b_ptr->qs);
+            int8x16x4_t b_high = vld1q_s8_x4((const int8_t *) b_ptr->qs + 64);
+            float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
+
+            int8x16x2_t a  = vld1q_s8_x2(a_ptr->qs);
+            float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
+
+            int32x4_t ret = vdupq_n_s32(0);
+
+            ret = vdotq_laneq_s32(ret, b_low.val[0], a.val[0], 0);
+            ret = vdotq_laneq_s32(ret, b_low.val[1], a.val[0], 1);
+            ret = vdotq_laneq_s32(ret, b_low.val[2], a.val[0], 2);
+            ret = vdotq_laneq_s32(ret, b_low.val[3], a.val[0], 3);
+
+            ret = vdotq_laneq_s32(ret, b_high.val[0], a.val[1], 0);
+            ret = vdotq_laneq_s32(ret, b_high.val[1], a.val[1], 1);
+            ret = vdotq_laneq_s32(ret, b_high.val[2], a.val[1], 2);
+            ret = vdotq_laneq_s32(ret, b_high.val[3], a.val[1], 3);
+
+            acc = vfmaq_f32(acc, vcvtq_f32_s32(ret), vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
+            a_ptr++;
+            b_ptr++;
+        }
+        vst1q_f32(s, acc);
+        s += ncols_interleaved;
+    }
+    return;
+
+#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    ggml_gemv_q8_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemv_q8_0_4x8_q8_0(int n,
+                             float * GGML_RESTRICT s,
+                             size_t bs,
+                             const void * GGML_RESTRICT vx,
+                             const void * GGML_RESTRICT vy,
+                             int nr,
+                             int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 8;
+
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx;
+
+    for (int c = 0; c < nc; c += ncols_interleaved) {
+        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+        float32x4_t acc = vdupq_n_f32(0);
+
+        for (int b = 0; b < nb; b++) {
+            int8x16x4_t b_low  = vld1q_s8_x4((const int8_t *) b_ptr->qs);
+            int8x16x4_t b_high = vld1q_s8_x4((const int8_t *) b_ptr->qs + 64);
+            float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
+
+            int8x8x4_t a_chunks = vld1_s8_x4(a_ptr->qs);
+            int8x16_t a0 = vcombine_s8(a_chunks.val[0], a_chunks.val[0]);
+            int8x16_t a1 = vcombine_s8(a_chunks.val[1], a_chunks.val[1]);
+            int8x16_t a2 = vcombine_s8(a_chunks.val[2], a_chunks.val[2]);
+            int8x16_t a3 = vcombine_s8(a_chunks.val[3], a_chunks.val[3]);
+            float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
+
+            int32x4_t ret0 = vdupq_n_s32(0);
+            int32x4_t ret1 = vdupq_n_s32(0);
+
+            // 0..7
+            ret0 = vdotq_s32(ret0, b_low.val[0], a0);
+            ret1 = vdotq_s32(ret1, b_low.val[1], a0);
+            // 8..15
+            ret0 = vdotq_s32(ret0, b_low.val[2], a1);
+            ret1 = vdotq_s32(ret1, b_low.val[3], a1);
+            // 16..23
+            ret0 = vdotq_s32(ret0, b_high.val[0], a2);
+            ret1 = vdotq_s32(ret1, b_high.val[1], a2);
+            // 24..31
+            ret0 = vdotq_s32(ret0, b_high.val[2], a3);
+            ret1 = vdotq_s32(ret1, b_high.val[3], a3);
+
+            int32x4_t ret = vpaddq_s32(ret0, ret1);
+
+            acc = vfmaq_f32(acc, vcvtq_f32_s32(ret), vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
+            a_ptr++;
+            b_ptr++;
+        }
+        vst1q_f32(s, acc);
+        s += ncols_interleaved;
+    }
+    return;
+
+#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    ggml_gemv_q8_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
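
Note: both new GEMV kernels above accumulate, per interleaved output column, a sum of per-block q8_0 dot products. The following is a minimal scalar sketch of that per-block step, not part of the package itself; fp16_to_fp32 stands in for GGML_FP16_TO_FP32. The vdotq_* intrinsics vectorize the inner int8 sum and vfmaq_f32 performs the scaled accumulation.

#include <stdint.h>

// One q8_0 x q8_0 block: int8 dot accumulated in int32, then scaled by the
// two fp16 block scales (already converted to float here).
static float q8_0_block_dot(const int8_t a_qs[32], float a_d,
                            const int8_t b_qs[32], float b_d) {
    int32_t sumi = 0;
    for (int k = 0; k < 32; k++) {              // QK8_0 = 32 quants per block
        sumi += (int32_t) a_qs[k] * (int32_t) b_qs[k];
    }
    return (float) sumi * a_d * b_d;            // apply both block scales
}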
@@ -2610,3 +2737,159 @@ void ggml_gemm_q4_K_8x8_q8_K(int n,
 #endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     ggml_gemm_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
 }
+
+
+void ggml_gemm_q8_0_4x4_q8_0(int n,
+                             float * GGML_RESTRICT s,
+                             size_t bs,
+                             const void * GGML_RESTRICT vx,
+                             const void * GGML_RESTRICT vy,
+                             int nr,
+                             int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+
+            float32x4_t sumf[4];
+            for (int m = 0; m < 4; m++) {
+                sumf[m] = vdupq_n_f32(0);
+            }
+
+            for (int l = 0; l < nb; l++) {
+                float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *) a_ptr[l].d));
+                float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *) b_ptr[l].d));
+
+                int32x4_t sumi_0 = vdupq_n_s32(0);
+                int32x4_t sumi_1 = vdupq_n_s32(0);
+                int32x4_t sumi_2 = vdupq_n_s32(0);
+                int32x4_t sumi_3 = vdupq_n_s32(0);
+
+                for (int k_group = 0; k_group < 8; k_group += 4) {
+                    int8x16x4_t a = vld1q_s8_x4(a_ptr[l].qs + 16 * k_group);
+                    int8x16x4_t b = vld1q_s8_x4(b_ptr[l].qs + 16 * k_group);
+
+                    for (int k = 0; k < 4; k++) {
+                        sumi_0 = vdotq_laneq_s32(sumi_0, b.val[k], a.val[k], 0);
+                        sumi_1 = vdotq_laneq_s32(sumi_1, b.val[k], a.val[k], 1);
+                        sumi_2 = vdotq_laneq_s32(sumi_2, b.val[k], a.val[k], 2);
+                        sumi_3 = vdotq_laneq_s32(sumi_3, b.val[k], a.val[k], 3);
+                    }
+                }
+
+                sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
+                sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
+                sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
+                sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
+            }
+
+            for (int m = 0; m < 4; m++) {
+                vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
+            }
+        }
+    }
+    return;
+#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    ggml_gemm_q8_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
+
+void ggml_gemm_q8_0_4x8_q8_0(int n,
+                             float * GGML_RESTRICT s,
+                             size_t bs,
+                             const void * GGML_RESTRICT vx,
+                             const void * GGML_RESTRICT vy,
+                             int nr,
+                             int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 8;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(nb);
+    UNUSED(ncols_interleaved);
+    UNUSED(blocklen);
+
+#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+    const block_q8_0x4 * b_ptr_base = (const block_q8_0x4 *) vx;
+
+    for (int y = 0; y < nr; y += 4) {
+        const block_q8_0x4 * a_ptr_base = (const block_q8_0x4 *) vy + (y / 4) * nb;
+
+        for (int x = 0; x < nc; x += ncols_interleaved) {
+            const block_q8_0x4 * b_ptr = b_ptr_base + (x / 4) * nb;
+            const block_q8_0x4 * a_ptr = a_ptr_base;
+
+            float32x4_t acc_f32[4];
+            for (int i = 0; i < 4; i++) {
+                acc_f32[i] = vdupq_n_f32(0);
+            }
+
+            for (int b = 0; b < nb; b++) {
+                int32x4_t acc[4];
+                for (int i = 0; i < 4; i++) {
+                    acc[i] = vdupq_n_s32(0);
+                }
+
+                // Process 4 chunks of 8 positions each
+                for (int chunk = 0; chunk < 4; chunk++) {
+                    int8x16_t a01 = vld1q_s8(a_ptr->qs + chunk * 32);
+                    int8x16_t a23 = vld1q_s8(a_ptr->qs + chunk * 32 + 16);
+                    int8x16_t b01 = vld1q_s8(b_ptr->qs + chunk * 32);
+                    int8x16_t b23 = vld1q_s8(b_ptr->qs + chunk * 32 + 16);
+
+                    acc[0] = vmmlaq_s32(acc[0], a01, b01);
+                    acc[1] = vmmlaq_s32(acc[1], a01, b23);
+                    acc[2] = vmmlaq_s32(acc[2], a23, b01);
+                    acc[3] = vmmlaq_s32(acc[3], a23, b23);
+                }
+
+                // Reorder outputs from 2×2 tiles to row-major
+                // acc[0] = [r0c0, r0c1, r1c0, r1c1]
+                // acc[1] = [r0c2, r0c3, r1c2, r1c3]
+                // acc[2] = [r2c0, r2c1, r3c0, r3c1]
+                // acc[3] = [r2c2, r2c3, r3c2, r3c3]
+                int32x4_t row0 = vcombine_s32(vget_low_s32(acc[0]),  vget_low_s32(acc[1]));
+                int32x4_t row1 = vcombine_s32(vget_high_s32(acc[0]), vget_high_s32(acc[1]));
+                int32x4_t row2 = vcombine_s32(vget_low_s32(acc[2]),  vget_low_s32(acc[3]));
+                int32x4_t row3 = vcombine_s32(vget_high_s32(acc[2]), vget_high_s32(acc[3]));
+
+                // Scales
+                float32x4_t a_d = vcvt_f32_f16(vld1_f16((const __fp16 *) a_ptr->d));
+                float32x4_t b_d = vcvt_f32_f16(vld1_f16((const __fp16 *) b_ptr->d));
+
+                acc_f32[0] = vfmaq_f32(acc_f32[0], vcvtq_f32_s32(row0), vmulq_laneq_f32(b_d, a_d, 0));
+                acc_f32[1] = vfmaq_f32(acc_f32[1], vcvtq_f32_s32(row1), vmulq_laneq_f32(b_d, a_d, 1));
+                acc_f32[2] = vfmaq_f32(acc_f32[2], vcvtq_f32_s32(row2), vmulq_laneq_f32(b_d, a_d, 2));
+                acc_f32[3] = vfmaq_f32(acc_f32[3], vcvtq_f32_s32(row3), vmulq_laneq_f32(b_d, a_d, 3));
+
+                a_ptr++;
+                b_ptr++;
+            }
+
+            for (int row = 0; row < 4; row++) {
+                vst1q_f32(s + (y + row) * bs + x, acc_f32[row]);
+            }
+        }
+    }
+    return;
+#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+    ggml_gemm_q8_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
+}
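
Note: the ggml_gemm_q8_0_4x8_q8_0 path uses the i8mm SMMLA instruction via vmmlaq_s32; as the in-code comments state, each call yields a row-major 2×2 int32 tile. Below is a scalar sketch of a single such step under the layout those comments describe, for illustration only and not part of the package.

#include <stdint.h>

// One vmmlaq_s32-style step: two 8-wide int8 rows of activations dotted
// against two 8-wide int8 rows of weights, accumulated as a 2x2 int32 tile
// stored row-major as [r0c0, r0c1, r1c0, r1c1].
static void smmla_2x2(int32_t acc[4], const int8_t a[2][8], const int8_t b[2][8]) {
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            int32_t sum = 0;
            for (int k = 0; k < 8; k++) {
                sum += (int32_t) a[i][k] * (int32_t) b[j][k];
            }
            acc[i * 2 + j] += sum;
        }
    }
}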
@@ -43,6 +43,8 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
+#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
@@ -51,6 +53,8 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
+#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
 // repack.cpp
 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
@@ -67,10 +71,14 @@
 #define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
 #define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
+#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
+#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
+#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
+#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__POWERPC__) || defined(__powerpc__)
 // ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
 // quants.c
@@ -91,6 +99,8 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
+#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
@@ -99,6 +109,8 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
+#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__loongarch64)
 // quants.c
 #define quantize_row_q8_K_generic quantize_row_q8_K
@@ -119,6 +131,8 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
+#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
@@ -127,6 +141,8 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
+#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__riscv)
 // quants.c
 #define quantize_row_q8_K_generic quantize_row_q8_K
@@ -154,6 +170,8 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
+#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
@@ -161,6 +179,8 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
+#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__s390x__)
 // quants.c
 #define quantize_row_q8_K_generic quantize_row_q8_K
@@ -187,6 +207,8 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
+#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
@@ -195,6 +217,8 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
+#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__wasm__)
 // quants.c
 #define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
@@ -223,6 +247,8 @@
 #define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
 #define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
 #define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
+#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
 #define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
 #define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
 #define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
@@ -231,4 +257,6 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
 #define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
+#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
+#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #endif
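
Note: the arch-fallback.h additions register the new q8_0 kernels in the same aliasing scheme used for the other repacked kernels: on targets without a hand-written version, the _generic name is #define'd to the public name, so the portable implementation is compiled as the public symbol. On ARM builds the optimized functions keep the _generic symbol separate and call it when the required SIMD feature is not compiled in, as seen at the end of each kernel above. A miniature illustration of the pattern, with hypothetical names not taken from the package:

// On targets lacking an optimized build, alias the generic kernel to the
// public symbol; callers always reference my_kernel either way.
#ifndef HAVE_OPTIMIZED_MY_KERNEL          // hypothetical feature guard
#define my_kernel_generic my_kernel
#endif

void my_kernel_generic(const float * x, float * y, int n) {
    for (int i = 0; i < n; i++) {
        y[i] = 2.0f * x[i];               // portable reference path
    }
}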
@@ -3320,13 +3320,33 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
-#elif defined(__riscv_zvfh)
-    for (int vl; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m1(n - i);
-        vfloat16m1_t vx = __riscv_vle16_v_f16m1((_Float16 *)&x[i], vl);
-        vfloat32m2_t vy = __riscv_vfwcvt_f_f_v_f32m2(vx, vl);
-        __riscv_vse32_v_f32m2(&y[i], vy, vl);
+
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfhmin)
+    // calculate step size
+    const int epr  = __riscv_vsetvlmax_e16m2();
+    const int step = epr * 2;
+    const int np   = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
+
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16*)x + i + epr, epr);
+        vfloat32m4_t ay1 = __riscv_vfwcvt_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
     }
+
+    // leftovers
+    int vl;
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, vl);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
+    }
+
 #endif
 
     for (; i < n; ++i) {
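
Note: both new RISC-V conversion loops follow the same strip-mining shape: a main loop unrolled by two full vector chunks, then a tail loop that lets __riscv_vsetvl_e16m2 shrink the final chunk. A portable sketch of that shape is below; CHUNK and convert_chunk are illustrative stand-ins for the vlmax-sized epr and the widening-convert intrinsics, and the mask trick for np assumes the step is a power of two, as vlmax is in practice.

enum { CHUNK = 8 };                           // stand-in for epr = vsetvlmax

// Placeholder for the real fp16/bf16 widening convert of one chunk.
static void convert_chunk(const unsigned short * x, float * y, int len) {
    for (int k = 0; k < len; k++) {
        y[k] = (float) x[k];
    }
}

static void convert_all(const unsigned short * x, float * y, int n) {
    const int step = 2 * CHUNK;
    const int np   = n & ~(step - 1);         // largest multiple of step <= n
    int i = 0;
    for (; i < np; i += step) {               // main loop, unrolled by 2
        convert_chunk(x + i,         y + i,         CHUNK);
        convert_chunk(x + i + CHUNK, y + i + CHUNK, CHUNK);
    }
    for (; i < n; i += CHUNK) {               // leftovers, last chunk may be partial
        int len = (n - i < CHUNK) ? (n - i) : CHUNK;
        convert_chunk(x + i, y + i, len);
    }
}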
@@ -3371,6 +3391,31 @@ void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
                                         (const __m128i *)(x + i))),
                                 16)));
     }
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfmin)
+    // calculate step size
+    const int epr  = __riscv_vsetvlmax_e16m2();
+    const int step = epr * 2;
+    const int np   = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, epr);
+        vfloat32m4_t  ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
+
+        vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16*)x + i + epr, epr);
+        vfloat32m4_t  ay1 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
+    }
+
+    // leftovers
+    int vl;
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, vl);
+        vfloat32m4_t  ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
+    }
 #endif
     for (; i < n; i++) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
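
Note: bf16 stores the upper 16 bits of an IEEE-754 single, so widening to fp32 is a 16-bit left shift of the bit pattern, which is what the scalar GGML_BF16_TO_FP32 tail above amounts to. A minimal sketch of that conversion, not taken from the package:

#include <stdint.h>
#include <string.h>

static inline float bf16_to_fp32(uint16_t h) {
    uint32_t bits = (uint32_t) h << 16;   // bf16 payload becomes the float's high bits
    float f;
    memcpy(&f, &bits, sizeof(f));         // bit-exact reinterpretation
    return f;
}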