cui-llama.rn 1.1.4 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/ggml-quants.c CHANGED
@@ -4003,42 +4003,141 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void
  float sumf = 0;

  #if defined(__ARM_FEATURE_SVE)
- if (lm_ggml_sve_cnt_b == QK8_0) {
- const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
- const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
+ svfloat32_t sumv0 = svdup_n_f32(0.0f);
+ svfloat32_t sumv1 = svdup_n_f32(0.0f);

- svfloat32_t sumv0 = svdup_n_f32(0.0f);
- svfloat32_t sumv1 = svdup_n_f32(0.0f);
+ const int vector_length = lm_ggml_sve_cnt_b*8;

- for (; ib + 1 < nb; ib += 2) {
- const block_q4_0 * restrict x0 = &x[ib + 0];
- const block_q4_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
-
- // load x
- const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
- const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
-
- // 4-bit -> 8-bit
- const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04));
- const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04));
-
- // sub 8
- const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
- const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
+ // VLA Implementation using switch case
+ switch (vector_length) {
+ case 128:
+ {
+ // predicate for activating higher lanes for 4 float32 elements
+ const svbool_t ph4 = svptrue_pat_b32(SV_VL4);
+
+ for (; ib + 1 < nb; ib += 2) {
+ const block_q4_0 * restrict x0 = &x[ib + 0];
+ const block_q4_0 * restrict x1 = &x[ib + 1];
+ const block_q8_0 * restrict y0 = &y[ib + 0];
+ const block_q8_0 * restrict y1 = &y[ib + 1];
+
+ // load x
+ const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
+ const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
+
+ // 4-bit -> 8-bit
+ const svint8_t qx0l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx0r, 0x0F));
+ const svint8_t qx0h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx0r, 0x04));
+ const svint8_t qx1l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx1r, 0x0F));
+ const svint8_t qx1h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx1r, 0x04));
+
+ // sub 8
+ const svint8_t qx0ls = svsub_n_s8_x(svptrue_b8(), qx0h, 8);
+ const svint8_t qx0hs = svsub_n_s8_x(svptrue_b8(), qx0l, 8);
+ const svint8_t qx1ls = svsub_n_s8_x(svptrue_b8(), qx1h, 8);
+ const svint8_t qx1hs = svsub_n_s8_x(svptrue_b8(), qx1l, 8);
+
+ // load y
+ const svint8_t qy0h = svld1_s8(svptrue_b8(), y0->qs);
+ const svint8_t qy0l = svld1_s8(svptrue_b8(), y0->qs + 16);
+ const svint8_t qy1h = svld1_s8(svptrue_b8(), y1->qs);
+ const svint8_t qy1l = svld1_s8(svptrue_b8(), y1->qs + 16);
+
+ // dot product
+ sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4,
+ svdot_s32(svdup_n_s32(0), qx0ls, qy0l),
+ svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d));
+ sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4,
+ svdot_s32(svdup_n_s32(0), qx1ls, qy1l),
+ svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d));
+ }

- // load y
- const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
- const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
+ sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+ } break;
+ case 256:
+ {
+ // predicate for activating higher lanes for 16 int8 elements
+ const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
+ // predicate for activating lower lanes for 16 int8 elements
+ const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16);
+
+ for (; ib + 1 < nb; ib += 2) {
+ const block_q4_0 * restrict x0 = &x[ib + 0];
+ const block_q4_0 * restrict x1 = &x[ib + 1];
+ const block_q8_0 * restrict y0 = &y[ib + 0];
+ const block_q8_0 * restrict y1 = &y[ib + 1];
+
+ // load x
+ const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
+ const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
+
+ // 4-bit -> 8-bit
+ const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
+ const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));
+
+ // sub 8
+ const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
+ const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
+
+ // load y
+ const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
+ const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
+
+ // dot product
+ sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
+ svdot_s32(svdup_n_s32(0), qx0s, qy0)), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d));
+ sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
+ svdot_s32(svdup_n_s32(0), qx1s, qy1)), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d));
+ }

- // dot product
- sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d));
- sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d));
- }
+ sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+ } break;
+ case 512:
+ {
+ // predicate for activating higher lanes for 32 int8 elements
+ const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
+
+ // predicate for activating higher lanes for 16 int8 elements
+ const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
+ // predicate for activating lower lanes for 16 int8 elements from first 32 int8 activated lanes
+ const svbool_t pl16 = svnot_b_z(ph32, ph16);
+
+ for (; ib + 1 < nb; ib += 2) {
+ const block_q4_0 * restrict x0 = &x[ib + 0];
+ const block_q4_0 * restrict x1 = &x[ib + 1];
+ const block_q8_0 * restrict y0 = &y[ib + 0];
+ const block_q8_0 * restrict y1 = &y[ib + 1];
+
+ // load x
+ const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs);
+ const svuint8_t qx1r = svld1rq_u8(ph32, x1->qs);
+
+ // 4-bit -> 8-bit
+ const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
+ const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));
+
+ // sub 8
+ const svint8_t qx0s = svsub_n_s8_x(ph32, qx0, 8);
+ const svint8_t qx1s = svsub_n_s8_x(ph32, qx1, 8);
+
+ // load y
+ const svint8_t qy0 = svld1_s8(ph32, y0->qs);
+ const svint8_t qy1 = svld1_s8(ph32, y1->qs);
+
+ // dot product
+ sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32,
+ svdot_s32(svdup_n_s32(0), qx0s, qy0)), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d));
+ sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32,
+ svdot_s32(svdup_n_s32(0), qx1s, qy1)), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d));
+ }

- sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+ sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1));
+ } break;
+ default:
+ assert(false && "Unsupported vector length");
+ break;
  }
+
  #elif defined(__ARM_NEON)
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
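
Note: the hunk above replaces the fixed 256-bit SVE path of lm_ggml_vec_dot_q4_0_q8_0 with a vector-length-agnostic dispatch (vector_length = lm_ggml_sve_cnt_b*8, then a switch over 128/256/512-bit branches). Every branch has to reproduce the same per-block result; a scalar sketch of that reference computation is shown below (not code from this package; it assumes the usual ggml q4_0/q8_0 block layouts of an fp16 scale plus packed quants):

    // Reference for one pair of blocks: 32 packed 4-bit weights with an implicit
    // bias of 8, multiplied against 32 int8 activations, scaled by both fp16 scales.
    static float q4_0_q8_0_block_dot_ref(const block_q4_0 * x, const block_q8_0 * y) {
        int sumi = 0;
        for (int j = 0; j < QK8_0/2; ++j) {
            const int v0 = (x->qs[j] & 0x0F) - 8;  // low nibble  -> element j
            const int v1 = (x->qs[j] >>   4) - 8;  // high nibble -> element j + 16
            sumi += v0 * y->qs[j] + v1 * y->qs[j + QK8_0/2];
        }
        return LM_GGML_FP16_TO_FP32(x->d) * LM_GGML_FP16_TO_FP32(y->d) * (float) sumi;
    }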
@@ -5488,29 +5587,124 @@ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void
  float sumf = 0;

  #if defined(__ARM_FEATURE_SVE)
- if (lm_ggml_sve_cnt_b == QK8_0) {
- svfloat32_t sumv0 = svdup_n_f32(0.0f);
- svfloat32_t sumv1 = svdup_n_f32(0.0f);
+ svfloat32_t sumv0 = svdup_n_f32(0.0f);
+ svfloat32_t sumv1 = svdup_n_f32(0.0f);

- for (; ib + 1 < nb; ib += 2) {
- const block_q8_0 * restrict x0 = &x[ib + 0];
- const block_q8_0 * restrict x1 = &x[ib + 1];
- const block_q8_0 * restrict y0 = &y[ib + 0];
- const block_q8_0 * restrict y1 = &y[ib + 1];
+ const int vector_length = lm_ggml_sve_cnt_b*8;

- // load x
- const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
- const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
+ //VLA Implemenation for SVE
+ switch (vector_length) {
+ case 128:
+ {
+ // predicate for activating lanes for 16 Int8 elements
+ const svbool_t ph16 = svptrue_pat_b8 (SV_VL16);
+ const svbool_t pl16 = svptrue_pat_b32(SV_VL4);
+
+ for (; ib + 1 < nb; ib += 2) {
+ const block_q8_0 * restrict x0 = &x[ib + 0];
+ const block_q8_0 * restrict x1 = &x[ib + 1];
+ const block_q8_0 * restrict y0 = &y[ib + 0];
+ const block_q8_0 * restrict y1 = &y[ib + 1];
+
+ // load x
+ const svint8_t qx0_0 = svld1_s8(ph16, x0->qs);
+ const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16);
+ const svint8_t qx1_0 = svld1_s8(ph16, x1->qs);
+ const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16);
+
+ // load y
+ const svint8_t qy0_0 = svld1_s8(ph16, y0->qs);
+ const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16);
+ const svint8_t qy1_0 = svld1_s8(ph16, y1->qs);
+ const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16);
+
+ sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16,
+ svdot_s32(svdup_n_s32(0), qx0_0, qy0_0),
+ svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d));
+ sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16,
+ svdot_s32(svdup_n_s32(0), qx1_0, qy1_0),
+ svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d));
+ }

- // load y
- const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
- const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
+ sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1));
+ } break;
+ case 256:
+ {
+ //printf("sve256");
+ for (; ib + 1 < nb; ib += 2) {
+ const block_q8_0 * restrict x0 = &x[ib + 0];
+ const block_q8_0 * restrict x1 = &x[ib + 1];
+ const block_q8_0 * restrict y0 = &y[ib + 0];
+ const block_q8_0 * restrict y1 = &y[ib + 1];
+
+ // load x
+ const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
+ const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
+
+ // load y
+ const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
+ const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
+
+ sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
+ svdot_s32(svdup_n_s32(0), qx0, qy0)), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d));
+ sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
+ svdot_s32(svdup_n_s32(0), qx1, qy1)), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d));
+ }

- sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d));
- sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d));
- }
+ sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+ } break;
+ case 512:
+ {
+ // predicate for activating high 256 bit
+ const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
+ // predicate for activating low 256 bit
+ const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32);
+
+ // predicate for activating high lanes for 8 float32 elements
+ const svbool_t ph8 = svptrue_pat_b32(SV_VL8);
+ // predicate for activating low lanes for 8 float32 elements
+ const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8);
+
+ svfloat32_t sumv00 = svdup_n_f32(0.0f);
+
+ for (; ib + 1 < nb; ib += 2) {
+ const block_q8_0 * restrict x0 = &x[ib + 0];
+ const block_q8_0 * restrict x1 = &x[ib + 1];
+ const block_q8_0 * restrict y0 = &y[ib + 0];
+ const block_q8_0 * restrict y1 = &y[ib + 1];
+
+ //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits
+ // and add them to make one 64 element vector
+ // load x
+ const svint8_t qx_32 = svld1_s8(ph32, x0->qs);
+ svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2);
+
+ qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64);

- sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+ // load y
+ const svint8_t qy_32 = svld1_s8(ph32, y0->qs);
+ svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2);
+
+ qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64);
+
+ // scale creation
+ const float32_t deq1 = LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d);
+ const float32_t deq2 = LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d);
+
+ // duplicate deq1 in first half of vector and deq2 in second half of vector
+ const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2);
+
+ const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64));
+
+ sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp);
+ }
+
+ sumf = svaddv_f32(svptrue_b32(), sumv00);
+ break;
+ }
+ default:
+ assert(false && "Unsupported vector length");
+ break;
  }
  #elif defined(__ARM_NEON)
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
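
The q8_0·q8_0 kernel receives the same vector-length-agnostic treatment, including a 512-bit branch that packs two 32-byte blocks into one register with complementary predicates and a per-half scale vector. For reference, a scalar sketch of the per-block result each branch computes is shown below (not code from this package; it assumes the standard q8_0 block of an fp16 scale plus 32 int8 values):

    static float q8_0_q8_0_block_dot_ref(const block_q8_0 * x, const block_q8_0 * y) {
        int sumi = 0;
        for (int j = 0; j < QK8_0; ++j) {
            sumi += x->qs[j] * y->qs[j];   // int8 dot product over one block
        }
        return LM_GGML_FP16_TO_FP32(x->d) * LM_GGML_FP16_TO_FP32(y->d) * (float) sumi;
    }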
package/cpp/ggml.c CHANGED
@@ -287,6 +287,7 @@ void lm_ggml_abort(const char * file, int line, const char * fmt, ...) {
  #define LM_GGML_DEBUG 0
  #define LM_GGML_GELU_FP16
  #define LM_GGML_GELU_QUICK_FP16
+ #define LM_GGML_N_TASKS_MAX (-1)

  #define LM_GGML_SOFT_MAX_UNROLL 4
  #define LM_GGML_VEC_DOT_UNROLL 2
@@ -1120,21 +1121,21 @@ lm_ggml_type_traits_t lm_ggml_internal_get_type_traits(enum lm_ggml_type type) {
  #define LM_GGML_F32x4_ADD vaddq_f32
  #define LM_GGML_F32x4_MUL vmulq_f32
  #define LM_GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
- #define LM_GGML_F32x4_REDUCE(res, x) \
- { \
- int offset = LM_GGML_F32_ARR >> 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = vaddq_f32(x[i], x[offset+i]); \
- } \
- offset >>= 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = vaddq_f32(x[i], x[offset+i]); \
- } \
- offset >>= 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = vaddq_f32(x[i], x[offset+i]); \
- } \
- res = LM_GGML_F32x4_REDUCE_ONE(x[0]); \
+ #define LM_GGML_F32x4_REDUCE(res, x) \
+ { \
+ int offset = LM_GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+ } \
+ (res) = LM_GGML_F32x4_REDUCE_ONE((x)[0]); \
  }

  #define LM_GGML_F32_VEC LM_GGML_F32x4
@@ -1161,30 +1162,30 @@ lm_ggml_type_traits_t lm_ggml_internal_get_type_traits(enum lm_ggml_type type) {
  #define LM_GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
  #define LM_GGML_F16x8_ADD vaddq_f16
  #define LM_GGML_F16x8_MUL vmulq_f16
- #define LM_GGML_F16x8_REDUCE(res, x) \
- do { \
- int offset = LM_GGML_F16_ARR >> 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = vaddq_f16(x[i], x[offset+i]); \
- } \
- offset >>= 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = vaddq_f16(x[i], x[offset+i]); \
- } \
- offset >>= 1; \
- for (int i = 0; i < offset; ++i) { \
- x[i] = vaddq_f16(x[i], x[offset+i]); \
- } \
- const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
- const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
- res = (lm_ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
+ #define LM_GGML_F16x8_REDUCE(res, x) \
+ do { \
+ int offset = LM_GGML_F16_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
+ } \
+ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
+ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
+ (res) = (lm_ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
  } while (0)

  #define LM_GGML_F16_VEC LM_GGML_F16x8
  #define LM_GGML_F16_VEC_ZERO LM_GGML_F16x8_ZERO
  #define LM_GGML_F16_VEC_SET1 LM_GGML_F16x8_SET1
  #define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F16x8_LOAD(p)
- #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F16x8_STORE((lm_ggml_fp16_internal_t *)(p), r[i])
+ #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F16x8_STORE((lm_ggml_fp16_internal_t *)(p), (r)[i])
  #define LM_GGML_F16_VEC_FMA LM_GGML_F16x8_FMA
  #define LM_GGML_F16_VEC_ADD LM_GGML_F16x8_ADD
  #define LM_GGML_F16_VEC_MUL LM_GGML_F16x8_MUL
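
The only functional change in the two REDUCE hunks above is macro hygiene: the parameters x and res are now parenthesized at every expansion site, so the macros stay correct when invoked with an expression rather than a plain identifier. A contrived sketch (not ggml code) of the failure mode the parentheses prevent:

    #define FIRST_BAD(x)  x[0]     // FIRST_BAD(p + 1)  expands to p + 1[0]   -> does not compile
    #define FIRST_GOOD(x) (x)[0]   // FIRST_GOOD(p + 1) expands to (p + 1)[0] -> reads p[1]

    float first_after(const float * p) {
        return FIRST_GOOD(p + 1);
    }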
@@ -1893,6 +1894,23 @@ static inline void __lsx_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
  #define LM_GGML_F16_ARR (LM_GGML_F16_STEP/LM_GGML_F16_EPR)
  #endif

+ //
+ // ggml object
+ //
+
+ struct lm_ggml_object {
+ size_t offs;
+ size_t size;
+
+ struct lm_ggml_object * next;
+
+ enum lm_ggml_object_type type;
+
+ char padding[4];
+ };
+
+ static const size_t LM_GGML_OBJECT_SIZE = sizeof(struct lm_ggml_object);
+
  //
  // ggml context
  //
@@ -3381,7 +3399,7 @@ double lm_ggml_type_sizef(enum lm_ggml_type type) {
  }

  LM_GGML_CALL const char * lm_ggml_type_name(enum lm_ggml_type type) {
- return type_traits[type].type_name;
+ return type < LM_GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
  }

  LM_GGML_CALL bool lm_ggml_is_quantized(enum lm_ggml_type type) {
@@ -3847,7 +3865,7 @@ static struct lm_ggml_object * lm_ggml_new_object(struct lm_ggml_context * ctx,

  if (cur_end + size_needed + LM_GGML_OBJECT_SIZE > ctx->mem_size) {
  LM_GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
- __func__, cur_end + size_needed, ctx->mem_size);
+ __func__, cur_end + size_needed + LM_GGML_OBJECT_SIZE, ctx->mem_size);
  assert(false);
  return NULL;
  }
@@ -19161,6 +19179,34 @@ void lm_ggml_graph_clear(struct lm_ggml_cgraph * cgraph) {
  lm_ggml_hash_set_reset(&cgraph->visited_hash_set);
  }

+ int lm_ggml_graph_size(struct lm_ggml_cgraph * cgraph) {
+ return cgraph->size;
+ }
+
+ struct lm_ggml_tensor * lm_ggml_graph_node(struct lm_ggml_cgraph * cgraph, int i) {
+ if (i < 0) {
+ LM_GGML_ASSERT(cgraph->n_nodes + i >= 0);
+ return cgraph->nodes[cgraph->n_nodes + i];
+ }
+
+ LM_GGML_ASSERT(i < cgraph->n_nodes);
+ return cgraph->nodes[i];
+ }
+
+ struct lm_ggml_tensor ** lm_ggml_graph_nodes(struct lm_ggml_cgraph * cgraph) {
+ return cgraph->nodes;
+ }
+
+ int lm_ggml_graph_n_nodes(struct lm_ggml_cgraph * cgraph) {
+ return cgraph->n_nodes;
+ }
+
+ void lm_ggml_graph_add_node(struct lm_ggml_cgraph * cgraph, struct lm_ggml_tensor * tensor) {
+ LM_GGML_ASSERT(cgraph->size > cgraph->n_nodes);
+ cgraph->nodes[cgraph->n_nodes] = tensor;
+ cgraph->n_nodes++;
+ }
+
  // Android's libc implementation "bionic" does not support setting affinity
  #if defined(__gnu_linux__)
  static void set_numa_thread_affinity(int thread_n) {
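
These accessors pair with the header change further down, where struct lm_ggml_cgraph stops being part of the public ggml.h surface. A usage sketch (hypothetical caller code, assuming a graph gf built elsewhere):

    #include <stdio.h>

    static void print_graph(struct lm_ggml_cgraph * gf) {
        for (int i = 0; i < lm_ggml_graph_n_nodes(gf); ++i) {
            printf("node %d: %s\n", i, lm_ggml_graph_node(gf, i)->name);
        }
        // negative indices count from the end, so -1 is the last (typically the output) node
        struct lm_ggml_tensor * last = lm_ggml_graph_node(gf, -1);
        (void) last;
    }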
@@ -23242,6 +23288,14 @@ int lm_ggml_cpu_has_arm_fma(void) {
  #endif
  }

+ int lm_ggml_cpu_has_riscv_v(void) {
+ #if defined(__riscv_v_intrinsic)
+ return 1;
+ #else
+ return 0;
+ #endif
+ }
+
  int lm_ggml_cpu_has_metal(void) {
  #if defined(LM_GGML_USE_METAL)
  return 1;
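
lm_ggml_cpu_has_riscv_v() follows the same 0/1 convention as the other feature probes: it reports whether the build defined __riscv_v_intrinsic, i.e. the RISC-V vector extension intrinsics. A sketch of how a caller might log it next to existing probes (hypothetical helper, not part of the package):

    #include <stdio.h>

    static void log_cpu_features(void) {
        printf("ARM_FMA = %d\n", lm_ggml_cpu_has_arm_fma());
        printf("RISCV_V = %d\n", lm_ggml_cpu_has_riscv_v());
        printf("METAL   = %d\n", lm_ggml_cpu_has_metal());
    }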
package/cpp/ggml.h CHANGED
@@ -358,6 +358,7 @@ extern "C" {

  struct lm_ggml_object;
  struct lm_ggml_context;
+ struct lm_ggml_cgraph;

  // NOTE: always add types at the end of the enum to keep backward compatibility
  enum lm_ggml_type {
@@ -575,23 +576,9 @@ extern "C" {
  LM_GGML_TENSOR_FLAG_PARAM = 4,
  };

- // ggml object
- struct lm_ggml_object {
- size_t offs;
- size_t size;
-
- struct lm_ggml_object * next;
-
- enum lm_ggml_object_type type;
-
- char padding[4];
- };
-
- static const size_t LM_GGML_OBJECT_SIZE = sizeof(struct lm_ggml_object);
-
  // n-dimensional tensor
  struct lm_ggml_tensor {
- enum lm_ggml_type type;
+ enum lm_ggml_type type;

  LM_GGML_DEPRECATED(enum lm_ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");

@@ -655,7 +642,7 @@ extern "C" {

  struct lm_ggml_threadpool; // forward declaration, see ggml.c

- typedef struct lm_ggml_threadpool * lm_ggml_threadpool_t;
+ typedef struct lm_ggml_threadpool * lm_ggml_threadpool_t;

  // the compute plan that needs to be prepared for lm_ggml_graph_compute()
  // since https://github.com/ggerganov/ggml/issues/287
@@ -671,35 +658,6 @@ extern "C" {
  void * abort_callback_data;
  };

- enum lm_ggml_cgraph_eval_order {
- LM_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
- LM_GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
- LM_GGML_CGRAPH_EVAL_ORDER_COUNT
- };
-
- typedef uint32_t lm_ggml_bitset_t;
-
- struct lm_ggml_hash_set {
- size_t size;
- lm_ggml_bitset_t * used; // whether or not the keys are in use i.e. set
- struct lm_ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if lm_ggml_bitset_get(used, i)
- };
-
- // computation graph
- struct lm_ggml_cgraph {
- int size;
- int n_nodes;
- int n_leafs;
-
- struct lm_ggml_tensor ** nodes;
- struct lm_ggml_tensor ** grads;
- struct lm_ggml_tensor ** leafs;
-
- struct lm_ggml_hash_set visited_hash_set;
-
- enum lm_ggml_cgraph_eval_order order;
- };
-
  // scratch buffer
  struct lm_ggml_scratch {
  size_t offs;
@@ -2017,8 +1975,6 @@ extern "C" {
  typedef void (*lm_ggml_custom2_op_t)(struct lm_ggml_tensor * dst , const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b, int ith, int nth, void * userdata);
  typedef void (*lm_ggml_custom3_op_t)(struct lm_ggml_tensor * dst , const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b, const struct lm_ggml_tensor * c, int ith, int nth, void * userdata);

- #define LM_GGML_N_TASKS_MAX -1
-
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom1(
  struct lm_ggml_context * ctx,
  struct lm_ggml_tensor * a,
@@ -2088,30 +2044,35 @@ extern "C" {
  struct lm_ggml_context * ctx,
  struct lm_ggml_tensor * tensor);

-
  LM_GGML_API void lm_ggml_build_forward_expand (struct lm_ggml_cgraph * cgraph, struct lm_ggml_tensor * tensor);
  LM_GGML_API void lm_ggml_build_backward_expand(struct lm_ggml_context * ctx, struct lm_ggml_cgraph * gf, struct lm_ggml_cgraph * gb, bool keep);

  // graph allocation in a context
- LM_GGML_API struct lm_ggml_cgraph * lm_ggml_new_graph (struct lm_ggml_context * ctx); // size = LM_GGML_DEFAULT_GRAPH_SIZE, grads = false
- LM_GGML_API struct lm_ggml_cgraph * lm_ggml_new_graph_custom (struct lm_ggml_context * ctx, size_t size, bool grads);
- LM_GGML_API struct lm_ggml_cgraph * lm_ggml_graph_dup (struct lm_ggml_context * ctx, struct lm_ggml_cgraph * cgraph);
- LM_GGML_API struct lm_ggml_cgraph lm_ggml_graph_view (struct lm_ggml_cgraph * cgraph, int i0, int i1);
- LM_GGML_API void lm_ggml_graph_cpy (struct lm_ggml_cgraph * src, struct lm_ggml_cgraph * dst);
- LM_GGML_API void lm_ggml_graph_reset (struct lm_ggml_cgraph * cgraph); // zero grads
- LM_GGML_API void lm_ggml_graph_clear (struct lm_ggml_cgraph * cgraph);
+ LM_GGML_API struct lm_ggml_cgraph * lm_ggml_new_graph (struct lm_ggml_context * ctx); // size = LM_GGML_DEFAULT_GRAPH_SIZE, grads = false
+ LM_GGML_API struct lm_ggml_cgraph * lm_ggml_new_graph_custom(struct lm_ggml_context * ctx, size_t size, bool grads);
+ LM_GGML_API struct lm_ggml_cgraph * lm_ggml_graph_dup (struct lm_ggml_context * ctx, struct lm_ggml_cgraph * cgraph);
+ LM_GGML_API void lm_ggml_graph_cpy (struct lm_ggml_cgraph * src, struct lm_ggml_cgraph * dst);
+ LM_GGML_API void lm_ggml_graph_reset (struct lm_ggml_cgraph * cgraph); // zero grads
+ LM_GGML_API void lm_ggml_graph_clear (struct lm_ggml_cgraph * cgraph);
+
+ LM_GGML_API int lm_ggml_graph_size (struct lm_ggml_cgraph * cgraph);
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_graph_node (struct lm_ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
+ LM_GGML_API struct lm_ggml_tensor ** lm_ggml_graph_nodes (struct lm_ggml_cgraph * cgraph);
+ LM_GGML_API int lm_ggml_graph_n_nodes(struct lm_ggml_cgraph * cgraph);
+
+ LM_GGML_API void lm_ggml_graph_add_node(struct lm_ggml_cgraph * cgraph, struct lm_ggml_tensor * tensor);

  LM_GGML_API size_t lm_ggml_graph_overhead(void);
  LM_GGML_API size_t lm_ggml_graph_overhead_custom(size_t size, bool grads);

- LM_GGML_API struct lm_ggml_threadpool_params lm_ggml_threadpool_params_default(int n_threads);
- LM_GGML_API void lm_ggml_threadpool_params_init (struct lm_ggml_threadpool_params *p, int n_threads);
- LM_GGML_API bool lm_ggml_threadpool_params_match (const struct lm_ggml_threadpool_params *p0, const struct lm_ggml_threadpool_params *p1);
- LM_GGML_API struct lm_ggml_threadpool* lm_ggml_threadpool_new (struct lm_ggml_threadpool_params * params);
- LM_GGML_API void lm_ggml_threadpool_free (struct lm_ggml_threadpool * threadpool);
- LM_GGML_API int lm_ggml_threadpool_get_n_threads(struct lm_ggml_threadpool * threadpool);
- LM_GGML_API void lm_ggml_threadpool_pause (struct lm_ggml_threadpool * threadpool);
- LM_GGML_API void lm_ggml_threadpool_resume (struct lm_ggml_threadpool * threadpool);
+ LM_GGML_API struct lm_ggml_threadpool_params lm_ggml_threadpool_params_default(int n_threads);
+ LM_GGML_API void lm_ggml_threadpool_params_init (struct lm_ggml_threadpool_params * p, int n_threads);
+ LM_GGML_API bool lm_ggml_threadpool_params_match (const struct lm_ggml_threadpool_params * p0, const struct lm_ggml_threadpool_params * p1);
+ LM_GGML_API struct lm_ggml_threadpool * lm_ggml_threadpool_new (struct lm_ggml_threadpool_params * params);
+ LM_GGML_API void lm_ggml_threadpool_free (struct lm_ggml_threadpool * threadpool);
+ LM_GGML_API int lm_ggml_threadpool_get_n_threads(struct lm_ggml_threadpool * threadpool);
+ LM_GGML_API void lm_ggml_threadpool_pause (struct lm_ggml_threadpool * threadpool);
+ LM_GGML_API void lm_ggml_threadpool_resume (struct lm_ggml_threadpool * threadpool);

  // lm_ggml_graph_plan() has to be called before lm_ggml_graph_compute()
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
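
Since struct lm_ggml_cgraph is now opaque to users of ggml.h, code that previously read its fields has to migrate to the accessors declared above. A minimal migration sketch (hypothetical caller code):

    // before (1.1.4): struct lm_ggml_tensor * last = gf->nodes[gf->n_nodes - 1];
    // after  (1.1.5):
    static struct lm_ggml_tensor * graph_output(struct lm_ggml_cgraph * gf) {
        return lm_ggml_graph_node(gf, -1);  // i < 0 indexes from the end: nodes[n_nodes + i]
    }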
@@ -2509,6 +2470,7 @@ extern "C" {
  LM_GGML_API int lm_ggml_cpu_has_gpublas (void);
  LM_GGML_API int lm_ggml_cpu_has_sse3 (void);
  LM_GGML_API int lm_ggml_cpu_has_ssse3 (void);
+ LM_GGML_API int lm_ggml_cpu_has_riscv_v (void);
  LM_GGML_API int lm_ggml_cpu_has_sycl (void);
  LM_GGML_API int lm_ggml_cpu_has_rpc (void);
  LM_GGML_API int lm_ggml_cpu_has_vsx (void);