llama_cpp 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -512,7 +512,7 @@ static inline int hsum_i32_4(const __m128i a) {
512
512
  return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
513
513
  }
514
514
 
515
- #if __AVX2__ || __AVX512F__
515
+ #if defined(__AVX2__) || defined(__AVX512F__)
516
516
  // spread 32 bits to 32 bytes { 0x00, 0xFF }
517
517
  static inline __m256i bytes_from_bits_32(const uint8_t * x) {
518
518
  uint32_t x32;
@@ -688,7 +688,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
688
688
  #endif // __AVX__ || __AVX2__ || __AVX512F__
689
689
  #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
690
690
 
691
- #if __ARM_NEON
691
+ #if defined(__ARM_NEON)
692
692
 
693
693
  #if !defined(__aarch64__)
694
694
 
@@ -740,19 +740,19 @@ inline static float vaddvq_f32(float32x4_t v) {
740
740
  return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
741
741
  }
742
742
 
743
- float vminvq_f32(float32x4_t v) {
743
+ inline static float vminvq_f32(float32x4_t v) {
744
744
  return
745
745
  MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
746
746
  MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
747
747
  }
748
748
 
749
- float vmaxvq_f32(float32x4_t v) {
749
+ inline static float vmaxvq_f32(float32x4_t v) {
750
750
  return
751
751
  MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
752
752
  MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
753
753
  }
754
754
 
755
- int32x4_t vcvtnq_s32_f32(float32x4_t v) {
755
+ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
756
756
  int32x4_t res;
757
757
 
758
758
  res[0] = roundf(vgetq_lane_f32(v, 0));
@@ -766,21 +766,20 @@ int32x4_t vcvtnq_s32_f32(float32x4_t v) {
766
766
  #endif
767
767
  #endif
768
768
 
769
-
770
769
  #define QK4_0 32
771
770
  typedef struct {
772
- float d; // delta
771
+ ggml_fp16_t d; // delta
773
772
  uint8_t qs[QK4_0 / 2]; // nibbles / quants
774
773
  } block_q4_0;
775
- static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
774
+ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
776
775
 
777
776
  #define QK4_1 32
778
777
  typedef struct {
779
- float d; // delta
780
- float m; // min
778
+ ggml_fp16_t d; // delta
779
+ ggml_fp16_t m; // min
781
780
  uint8_t qs[QK4_1 / 2]; // nibbles / quants
782
781
  } block_q4_1;
783
- static_assert(sizeof(block_q4_1) == 2 * sizeof(float) + QK4_1 / 2, "wrong q4_1 block size/padding");
782
+ static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
784
783
 
785
784
  #define QK5_0 32
786
785
  typedef struct {
@@ -801,16 +800,16 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
801
800
 
802
801
  #define QK8_0 32
803
802
  typedef struct {
804
- float d; // delta
805
- int8_t qs[QK8_0]; // quants
803
+ ggml_fp16_t d; // delta
804
+ int8_t qs[QK8_0]; // quants
806
805
  } block_q8_0;
807
- static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
806
+ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
808
807
 
809
808
  #define QK8_1 32
810
809
  typedef struct {
811
- float d; // delta
812
- float s; // d * sum(qs[i])
813
- int8_t qs[QK8_1]; // quants
810
+ float d; // delta
811
+ float s; // d * sum(qs[i])
812
+ int8_t qs[QK8_1]; // quants
814
813
  } block_q8_1;
815
814
  static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
816
815
 
@@ -837,7 +836,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
837
836
  const float d = max / -8;
838
837
  const float id = d ? 1.0f/d : 0.0f;
839
838
 
840
- y[i].d = d;
839
+ y[i].d = GGML_FP32_TO_FP16(d);
841
840
 
842
841
  for (int j = 0; j < qk/2; ++j) {
843
842
  const float x0 = x[i*qk + 0 + j]*id;
@@ -877,8 +876,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r
877
876
  const float d = (max - min) / ((1 << 4) - 1);
878
877
  const float id = d ? 1.0f/d : 0.0f;
879
878
 
880
- y[i].d = d;
881
- y[i].m = min;
879
+ y[i].d = GGML_FP32_TO_FP16(d);
880
+ y[i].m = GGML_FP32_TO_FP16(min);
882
881
 
883
882
  for (int j = 0; j < qk/2; ++j) {
884
883
  const float x0 = (x[i*qk + 0 + j] - min)*id;
@@ -1009,7 +1008,7 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
1009
1008
  const float d = amax / ((1 << 7) - 1);
1010
1009
  const float id = d ? 1.0f/d : 0.0f;
1011
1010
 
1012
- y[i].d = d;
1011
+ y[i].d = GGML_FP32_TO_FP16(d);
1013
1012
 
1014
1013
  for (int j = 0; j < QK8_0; ++j) {
1015
1014
  const float x0 = x[i*QK8_0 + j]*id;
@@ -1044,7 +1043,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
1044
1043
  const float d = amax / ((1 << 7) - 1);
1045
1044
  const float id = d ? 1.0f/d : 0.0f;
1046
1045
 
1047
- y[i].d = d;
1046
+ y[i].d = GGML_FP32_TO_FP16(d);
1048
1047
 
1049
1048
  for (int j = 0; j < 8; j++) {
1050
1049
  const float32x4_t v = vmulq_n_f32(srcv[j], id);
@@ -1056,6 +1055,39 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
1056
1055
  y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
1057
1056
  }
1058
1057
  }
1058
+ #elif defined(__wasm_simd128__)
1059
+ for (int i = 0; i < nb; i++) {
1060
+ v128_t srcv [8];
1061
+ v128_t asrcv[8];
1062
+ v128_t amaxv[8];
1063
+
1064
+ for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
1065
+ for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
1066
+
1067
+ for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
1068
+ for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
1069
+ for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
1070
+
1071
+ const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
1072
+ wasm_f32x4_extract_lane(amaxv[0], 1)),
1073
+ MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
1074
+ wasm_f32x4_extract_lane(amaxv[0], 3)));
1075
+
1076
+ const float d = amax / ((1 << 7) - 1);
1077
+ const float id = d ? 1.0f/d : 0.0f;
1078
+
1079
+ y[i].d = GGML_FP32_TO_FP16(d);
1080
+
1081
+ for (int j = 0; j < 8; j++) {
1082
+ const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
1083
+ const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
1084
+
1085
+ y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
1086
+ y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
1087
+ y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
1088
+ y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
1089
+ }
1090
+ }
1059
1091
  #elif defined(__AVX2__) || defined(__AVX__)
1060
1092
  for (int i = 0; i < nb; i++) {
1061
1093
  // Load elements into 4 AVX vectors
@@ -1079,7 +1111,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
1079
1111
 
1080
1112
  // Quantize these floats
1081
1113
  const float d = maxScalar / 127.f;
1082
- y[i].d = d;
1114
+ y[i].d = GGML_FP32_TO_FP16(d);
1083
1115
  const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
1084
1116
  const __m256 mul = _mm256_set1_ps( id );
1085
1117
 
@@ -1178,7 +1210,7 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r
1178
1210
  sum += y[i].qs[QK8_1/2 + j];
1179
1211
  }
1180
1212
 
1181
- y[i].s = d * sum;
1213
+ y[i].s = sum*d;
1182
1214
  }
1183
1215
  }
1184
1216
 
@@ -1224,6 +1256,48 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
1224
1256
 
1225
1257
  y[i].s = d * vaddvq_s32(accv);
1226
1258
  }
1259
+ #elif defined(__wasm_simd128__)
1260
+ for (int i = 0; i < nb; i++) {
1261
+ v128_t srcv [8];
1262
+ v128_t asrcv[8];
1263
+ v128_t amaxv[8];
1264
+
1265
+ for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
1266
+ for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
1267
+
1268
+ for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
1269
+ for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
1270
+ for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
1271
+
1272
+ const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
1273
+ wasm_f32x4_extract_lane(amaxv[0], 1)),
1274
+ MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
1275
+ wasm_f32x4_extract_lane(amaxv[0], 3)));
1276
+
1277
+ const float d = amax / ((1 << 7) - 1);
1278
+ const float id = d ? 1.0f/d : 0.0f;
1279
+
1280
+ y[i].d = d;
1281
+
1282
+ v128_t accv = wasm_i32x4_splat(0);
1283
+
1284
+ for (int j = 0; j < 8; j++) {
1285
+ const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
1286
+ const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
1287
+
1288
+ y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
1289
+ y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
1290
+ y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
1291
+ y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
1292
+
1293
+ accv = wasm_i32x4_add(accv, vi);
1294
+ }
1295
+
1296
+ y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) +
1297
+ wasm_i32x4_extract_lane(accv, 1) +
1298
+ wasm_i32x4_extract_lane(accv, 2) +
1299
+ wasm_i32x4_extract_lane(accv, 3));
1300
+ }
1227
1301
  #elif defined(__AVX2__) || defined(__AVX__)
1228
1302
  for (int i = 0; i < nb; i++) {
1229
1303
  // Load elements into 4 AVX vectors
@@ -1330,7 +1404,7 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict
1330
1404
  const int nb = k / qk;
1331
1405
 
1332
1406
  for (int i = 0; i < nb; i++) {
1333
- const float d = x[i].d;
1407
+ const float d = GGML_FP16_TO_FP32(x[i].d);
1334
1408
 
1335
1409
  for (int j = 0; j < qk/2; ++j) {
1336
1410
  const int x0 = (x[i].qs[j] & 0x0F) - 8;
@@ -1350,8 +1424,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict
1350
1424
  const int nb = k / qk;
1351
1425
 
1352
1426
  for (int i = 0; i < nb; i++) {
1353
- const float d = x[i].d;
1354
- const float m = x[i].m;
1427
+ const float d = GGML_FP16_TO_FP32(x[i].d);
1428
+ const float m = GGML_FP16_TO_FP32(x[i].m);
1355
1429
 
1356
1430
  for (int j = 0; j < qk/2; ++j) {
1357
1431
  const int x0 = (x[i].qs[j] & 0x0F);
@@ -1426,7 +1500,7 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
1426
1500
  const block_q8_0 * restrict x = vx;
1427
1501
 
1428
1502
  for (int i = 0; i < nb; i++) {
1429
- const float d = x[i].d;
1503
+ const float d = GGML_FP16_TO_FP32(x[i].d);
1430
1504
 
1431
1505
  for (int j = 0; j < qk; ++j) {
1432
1506
  y[i*qk + j] = x[i].qs[j]*d;
@@ -1690,8 +1764,9 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
1690
1764
  static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
1691
1765
  float tmp[8];
1692
1766
 
1693
- for (int i = 0; i < 8; i++)
1767
+ for (int i = 0; i < 8; i++) {
1694
1768
  tmp[i] = GGML_FP16_TO_FP32(x[i]);
1769
+ }
1695
1770
 
1696
1771
  return _mm256_loadu_ps(tmp);
1697
1772
  }
@@ -2111,8 +2186,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2111
2186
  const block_q8_0 * restrict y0 = &y[i + 0];
2112
2187
  const block_q8_0 * restrict y1 = &y[i + 1];
2113
2188
 
2114
- const uint8x16_t m4b = vdupq_n_u8(0x0F);
2115
- const int8x16_t s8b = vdupq_n_s8(0x8);
2189
+ const uint8x16_t m4b = vdupq_n_u8(0x0F);
2190
+ const int8x16_t s8b = vdupq_n_s8(0x8);
2116
2191
 
2117
2192
  const uint8x16_t v0_0 = vld1q_u8(x0->qs);
2118
2193
  const uint8x16_t v0_1 = vld1q_u8(x1->qs);
@@ -2140,8 +2215,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2140
2215
  const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
2141
2216
  const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
2142
2217
 
2143
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
2144
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
2218
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2219
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
2145
2220
  #else
2146
2221
  const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
2147
2222
  const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
@@ -2158,8 +2233,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2158
2233
  const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2159
2234
  const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2160
2235
 
2161
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
2162
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
2236
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2237
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
2163
2238
  #endif
2164
2239
  }
2165
2240
 
@@ -2171,7 +2246,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2171
2246
  // Main loop
2172
2247
  for (int i = 0; i < nb; ++i) {
2173
2248
  /* Compute combined scale for the block */
2174
- const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
2249
+ const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
2175
2250
 
2176
2251
  __m256i bx = bytes_from_nibbles_32(x[i].qs);
2177
2252
 
@@ -2195,7 +2270,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2195
2270
  // Main loop
2196
2271
  for (int i = 0; i < nb; ++i) {
2197
2272
  // Compute combined scale for the block
2198
- const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
2273
+ const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
2199
2274
 
2200
2275
  const __m128i lowMask = _mm_set1_epi8(0xF);
2201
2276
  const __m128i off = _mm_set1_epi8(8);
@@ -2237,7 +2312,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2237
2312
  _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0);
2238
2313
 
2239
2314
  // Compute combined scale for the block 0 and 1
2240
- const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[0].d ), _mm_set1_ps( y[0].d ) );
2315
+ const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) );
2241
2316
 
2242
2317
  const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
2243
2318
 
@@ -2255,7 +2330,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2255
2330
  _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);
2256
2331
 
2257
2332
  // Compute combined scale for the block 2 and 3
2258
- const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[1].d ), _mm_set1_ps( y[1].d ) );
2333
+ const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) );
2259
2334
 
2260
2335
  const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);
2261
2336
 
@@ -2288,7 +2363,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2288
2363
  _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
2289
2364
 
2290
2365
  // Compute combined scale for the block 0 and 1
2291
- const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[i].d ), _mm_set1_ps( y[i].d ) );
2366
+ const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
2292
2367
 
2293
2368
  const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);
2294
2369
 
@@ -2306,7 +2381,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2306
2381
  _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
2307
2382
 
2308
2383
  // Compute combined scale for the block 2 and 3
2309
- const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[i + 1].d ), _mm_set1_ps( y[i + 1].d ) );
2384
+ const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) );
2310
2385
 
2311
2386
  const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);
2312
2387
 
@@ -2354,7 +2429,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2354
2429
  sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
2355
2430
  }
2356
2431
 
2357
- sumf += (x[i].d*y[i].d)*sumi;
2432
+ sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
2358
2433
  }
2359
2434
 
2360
2435
  *s = sumf;
@@ -2384,7 +2459,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2384
2459
  const block_q8_1 * restrict y0 = &y[i + 0];
2385
2460
  const block_q8_1 * restrict y1 = &y[i + 1];
2386
2461
 
2387
- summs += x0->m * y0->s + x1->m * y1->s;
2462
+ summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s;
2388
2463
 
2389
2464
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
2390
2465
 
@@ -2408,8 +2483,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2408
2483
  const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
2409
2484
  const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
2410
2485
 
2411
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
2412
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
2486
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
2487
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
2413
2488
  #else
2414
2489
  const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
2415
2490
  const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
@@ -2426,8 +2501,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2426
2501
  const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2427
2502
  const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2428
2503
 
2429
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
2430
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
2504
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
2505
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
2431
2506
  #endif
2432
2507
  }
2433
2508
 
@@ -2440,13 +2515,13 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2440
2515
 
2441
2516
  // Main loop
2442
2517
  for (int i = 0; i < nb; ++i) {
2443
- const float * d0 = &x[i].d;
2444
- const float * d1 = &y[i].d;
2518
+ const float d0 = GGML_FP16_TO_FP32(x[i].d);
2519
+ const float d1 = y[i].d;
2445
2520
 
2446
- summs += x[i].m * y[i].s;
2521
+ summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
2447
2522
 
2448
- const __m256 d0v = _mm256_broadcast_ss( d0 );
2449
- const __m256 d1v = _mm256_broadcast_ss( d1 );
2523
+ const __m256 d0v = _mm256_set1_ps( d0 );
2524
+ const __m256 d1v = _mm256_set1_ps( d1 );
2450
2525
 
2451
2526
  // Compute combined scales
2452
2527
  const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
@@ -2480,7 +2555,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2480
2555
  sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
2481
2556
  }
2482
2557
 
2483
- sumf += (x[i].d*y[i].d)*sumi + x[i].m*y[i].s;
2558
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
2484
2559
  }
2485
2560
 
2486
2561
  *s = sumf;
@@ -2556,16 +2631,13 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2556
2631
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
2557
2632
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
2558
2633
 
2559
- const float x0d = GGML_FP16_TO_FP32(x0->d);
2560
- const float x1d = GGML_FP16_TO_FP32(x1->d);
2561
-
2562
2634
  #if defined(__ARM_FEATURE_DOTPROD)
2563
2635
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
2564
2636
  vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
2565
- vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d);
2637
+ vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2566
2638
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
2567
2639
  vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
2568
- vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d);
2640
+ vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
2569
2641
  #else
2570
2642
  const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
2571
2643
  const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
@@ -2582,8 +2654,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2582
2654
  const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2583
2655
  const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2584
2656
 
2585
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
2586
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d);
2657
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2658
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
2587
2659
  #endif
2588
2660
  }
2589
2661
 
@@ -2600,7 +2672,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2600
2672
  const block_q8_0 * restrict y0 = &y[i];
2601
2673
 
2602
2674
  const v128_t m4b = wasm_i8x16_splat(0x0F);
2603
- const v128_t s16b = wasm_i8x16_splat(0x10);
2604
2675
 
2605
2676
  // extract the 5th bit
2606
2677
  memcpy(&qh, x0->qh, sizeof(qh));
@@ -2638,15 +2709,14 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2638
2709
  const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
2639
2710
  const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
2640
2711
 
2641
- const float x0d = GGML_FP16_TO_FP32(x0->d);
2642
-
2643
2712
  // dot product
2644
2713
  sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
2645
2714
  wasm_i32x4_add(
2646
2715
  wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
2647
2716
  wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
2648
2717
  wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
2649
- wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
2718
+ wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
2719
+ wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
2650
2720
  }
2651
2721
 
2652
2722
  *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -2658,7 +2728,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2658
2728
  // Main loop
2659
2729
  for (int i = 0; i < nb; i++) {
2660
2730
  /* Compute combined scale for the block */
2661
- const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
2731
+ const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
2662
2732
 
2663
2733
  __m256i bx = bytes_from_nibbles_32(x[i].qs);
2664
2734
  __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -2682,7 +2752,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2682
2752
  // Main loop
2683
2753
  for (int i = 0; i < nb; i++) {
2684
2754
  /* Compute combined scale for the block */
2685
- const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
2755
+ const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
2686
2756
 
2687
2757
  __m256i bx = bytes_from_nibbles_32(x[i].qs);
2688
2758
  const __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -2725,7 +2795,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2725
2795
  sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
2726
2796
  }
2727
2797
 
2728
- sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi;
2798
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
2729
2799
  }
2730
2800
 
2731
2801
  *s = sumf;
@@ -2807,16 +2877,13 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2807
2877
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
2808
2878
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
2809
2879
 
2810
- const float x0d = GGML_FP16_TO_FP32(x0->d);
2811
- const float x1d = GGML_FP16_TO_FP32(x1->d);
2812
-
2813
2880
  #if defined(__ARM_FEATURE_DOTPROD)
2814
2881
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
2815
2882
  vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
2816
- vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d);
2883
+ vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
2817
2884
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
2818
2885
  vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
2819
- vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d);
2886
+ vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
2820
2887
  #else
2821
2888
  const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
2822
2889
  const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
@@ -2833,8 +2900,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2833
2900
  const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2834
2901
  const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2835
2902
 
2836
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
2837
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d);
2903
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
2904
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
2838
2905
  #endif
2839
2906
  }
2840
2907
 
@@ -2873,8 +2940,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2873
2940
  const v128_t v0l = wasm_v128_and (v0, m4b);
2874
2941
  const v128_t v0h = wasm_u8x16_shr(v0, 4);
2875
2942
 
2876
- static bool x = true;
2877
-
2878
2943
  // add high bit
2879
2944
  const v128_t v0lf = wasm_v128_or(v0l, qhl);
2880
2945
  const v128_t v0hf = wasm_v128_or(v0h, qhh);
@@ -2894,15 +2959,14 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2894
2959
  const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
2895
2960
  const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
2896
2961
 
2897
- const float x0d = GGML_FP16_TO_FP32(x0->d);
2898
-
2899
2962
  // dot product
2900
- sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
2901
- wasm_i32x4_add(
2963
+ sumv = wasm_f32x4_add(sumv,
2964
+ wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
2902
2965
  wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
2903
2966
  wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
2904
2967
  wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
2905
- wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
2968
+ wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
2969
+ wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
2906
2970
  }
2907
2971
 
2908
2972
  *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -2924,7 +2988,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2924
2988
  bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
2925
2989
  bx = _mm256_or_si256(bx, bxhi);
2926
2990
 
2927
- const __m256 dy = _mm256_broadcast_ss(&y[i].d);
2991
+ const __m256 dy = _mm256_set1_ps(y[i].d);
2928
2992
  const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
2929
2993
 
2930
2994
  const __m256 q = mul_sum_us8_pairs_float(bx, by);
@@ -2958,7 +3022,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2958
3022
  bxh = _mm_or_si128(bxh, bxhih);
2959
3023
  bx = _mm256_set_m128i(bxh, bxl);
2960
3024
 
2961
- const __m256 dy = _mm256_broadcast_ss(&y[i].d);
3025
+ const __m256 dy = _mm256_set1_ps(y[i].d);
2962
3026
  const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
2963
3027
 
2964
3028
  const __m256 q = mul_sum_us8_pairs_float(bx, by);
@@ -3028,11 +3092,11 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3028
3092
  #if defined(__ARM_FEATURE_DOTPROD)
3029
3093
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
3030
3094
  vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
3031
- vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), x0->d*y0->d);
3095
+ vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
3032
3096
 
3033
3097
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
3034
3098
  vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
3035
- vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), x1->d*y1->d);
3099
+ vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
3036
3100
 
3037
3101
  #else
3038
3102
  const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
@@ -3050,8 +3114,8 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3050
3114
  const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
3051
3115
  const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
3052
3116
 
3053
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), x0->d*y0->d);
3054
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), x1->d*y1->d);
3117
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
3118
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
3055
3119
  #endif
3056
3120
  }
3057
3121
 
@@ -3063,7 +3127,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3063
3127
  // Main loop
3064
3128
  for (int i = 0; i < nb; ++i) {
3065
3129
  // Compute combined scale for the block
3066
- const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
3130
+ const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
3067
3131
  __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
3068
3132
  __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
3069
3133
 
@@ -3089,7 +3153,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3089
3153
  sumi += x[i].qs[j]*y[i].qs[j];
3090
3154
  }
3091
3155
 
3092
- sumf += (x[i].d*y[i].d)*sumi;
3156
+ sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
3093
3157
  }
3094
3158
 
3095
3159
  *s = sumf;
@@ -3478,6 +3542,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
3478
3542
  "ROPE",
3479
3543
  "ROPE_BACK",
3480
3544
  "ALIBI",
3545
+ "CLAMP",
3481
3546
  "CONV_1D_1S",
3482
3547
  "CONV_1D_2S",
3483
3548
 
@@ -3488,7 +3553,8 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
3488
3553
  "MAP_BINARY",
3489
3554
  };
3490
3555
 
3491
- static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50");
3556
+ static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");
3557
+
3492
3558
 
3493
3559
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3494
3560
  "none",
@@ -3538,6 +3604,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3538
3604
  "rope(x)",
3539
3605
  "rope_back(x)",
3540
3606
  "alibi(x)",
3607
+ "clamp(x)",
3541
3608
  "conv_1d_1s(x)",
3542
3609
  "conv_1d_2s(x)",
3543
3610
 
@@ -3548,7 +3615,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3548
3615
  "f(x,y)",
3549
3616
  };
3550
3617
 
3551
- static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50");
3618
+ static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");
3552
3619
 
3553
3620
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
3554
3621
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -3782,6 +3849,12 @@ static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct g
3782
3849
  (t1->ne[3]%t0->ne[3] == 0);
3783
3850
  }
3784
3851
 
3852
+ static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
3853
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3854
+
3855
+ return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
3856
+ }
3857
+
// Rounds n up to the next multiple of 32 (values that are already a multiple
// of 32 are returned unchanged).
static inline int ggml_up32(int n) {
    const int mask = 31; // low five bits select the remainder modulo 32

    return (n + mask) & ~mask;
}
@@ -4664,11 +4737,15 @@ struct ggml_tensor * ggml_mul_impl(
4664
4737
  struct ggml_tensor * a,
4665
4738
  struct ggml_tensor * b,
4666
4739
  bool inplace) {
4667
- GGML_ASSERT(ggml_are_same_shape(a, b));
4740
+ // TODO: support less-strict constraint
4741
+ // GGML_ASSERT(ggml_can_repeat(b, a));
4742
+ GGML_ASSERT(ggml_can_repeat_rows(b, a));
4668
4743
 
4669
4744
  bool is_node = false;
4670
4745
 
4671
4746
  if (!inplace && (a->grad || b->grad)) {
4747
+ // TODO: support backward pass for broadcasting
4748
+ GGML_ASSERT(ggml_are_same_shape(a, b));
4672
4749
  is_node = true;
4673
4750
  }
4674
4751
 
@@ -6210,7 +6287,8 @@ struct ggml_tensor * ggml_alibi(
6210
6287
  struct ggml_context * ctx,
6211
6288
  struct ggml_tensor * a,
6212
6289
  int n_past,
6213
- int n_head) {
6290
+ int n_head,
6291
+ float bias_max) {
6214
6292
  GGML_ASSERT(n_past >= 0);
6215
6293
  bool is_node = false;
6216
6294
 
@@ -6229,6 +6307,8 @@ struct ggml_tensor * ggml_alibi(
6229
6307
 
6230
6308
  ((int32_t *) b->data)[0] = n_past;
6231
6309
  ((int32_t *) b->data)[1] = n_head;
6310
+ GGML_ASSERT(sizeof(float) == sizeof(int32_t));
6311
+ (((float *) b->data)[2]) = bias_max;
6232
6312
 
6233
6313
  ggml_scratch_load(ctx);
6234
6314
 
@@ -6240,6 +6320,40 @@ struct ggml_tensor * ggml_alibi(
6240
6320
  return result;
6241
6321
  }
6242
6322
 
6323
+ // ggml_clamp
6324
+
6325
+ struct ggml_tensor * ggml_clamp(
6326
+ struct ggml_context * ctx,
6327
+ struct ggml_tensor * a,
6328
+ float min,
6329
+ float max) {
6330
+ bool is_node = false;
6331
+
6332
+ if (a->grad) {
6333
+ GGML_ASSERT(false); // TODO: implement backward
6334
+ is_node = true;
6335
+ }
6336
+
6337
+ // TODO: when implement backward, fix this:
6338
+ struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6339
+
6340
+ ggml_scratch_save(ctx);
6341
+
6342
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
6343
+
6344
+ ((float *) b->data)[0] = min;
6345
+ ((float *) b->data)[1] = max;
6346
+
6347
+ ggml_scratch_load(ctx);
6348
+
6349
+ result->op = GGML_OP_CLAMP;
6350
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6351
+ result->src0 = a;
6352
+ result->src1 = b;
6353
+
6354
+ return result;
6355
+ }
6356
+
6243
6357
  // ggml_conv_1d_1s
6244
6358
 
6245
6359
  struct ggml_tensor * ggml_conv_1d_1s(
@@ -7966,7 +8080,7 @@ static void ggml_compute_forward_mul_f32(
7966
8080
  const struct ggml_tensor * src0,
7967
8081
  const struct ggml_tensor * src1,
7968
8082
  struct ggml_tensor * dst) {
7969
- assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
8083
+ GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
7970
8084
 
7971
8085
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7972
8086
  return;
@@ -7974,10 +8088,25 @@ static void ggml_compute_forward_mul_f32(
7974
8088
  const int ith = params->ith;
7975
8089
  const int nth = params->nth;
7976
8090
 
7977
- const int nr = ggml_nrows(src0);
7978
- const int64_t ne0 = src0->ne[0];
7979
- const int64_t ne1 = src0->ne[1];
7980
- const int64_t ne2 = src0->ne[2];
8091
+ #ifdef GGML_USE_CUBLAS
8092
+ if (src1->backend == GGML_BACKEND_CUDA) {
8093
+ if (ith == 0) {
8094
+ ggml_cuda_mul(src0, src1, dst);
8095
+ }
8096
+ return;
8097
+ }
8098
+ #endif
8099
+
8100
+ const int64_t nr = ggml_nrows(src0);
8101
+
8102
+ const int64_t ne00 = src0->ne[0];
8103
+ const int64_t ne01 = src0->ne[1];
8104
+ const int64_t ne02 = src0->ne[2];
8105
+
8106
+ const int64_t ne10 = src1->ne[0];
8107
+ const int64_t ne11 = src1->ne[1];
8108
+ const int64_t ne12 = src1->ne[2];
8109
+ const int64_t ne13 = src1->ne[3];
7981
8110
 
7982
8111
  const size_t nb00 = src0->nb[0];
7983
8112
  const size_t nb01 = src0->nb[1];
@@ -7996,44 +8125,51 @@ static void ggml_compute_forward_mul_f32(
7996
8125
 
7997
8126
  GGML_ASSERT( nb0 == sizeof(float));
7998
8127
  GGML_ASSERT(nb00 == sizeof(float));
8128
+ GGML_ASSERT(ne00 == ne10);
7999
8129
 
8000
8130
  if (nb10 == sizeof(float)) {
8001
- for (int ir = ith; ir < nr; ir += nth) {
8002
- // src0, src1 and dst are same shape => same indices
8003
- const int i3 = ir/(ne2*ne1);
8004
- const int i2 = (ir - i3*ne2*ne1)/ne1;
8005
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
8131
+ for (int64_t ir = ith; ir < nr; ir += nth) {
8132
+ // src0 and dst are same shape => same indices
8133
+ const int64_t i03 = ir/(ne02*ne01);
8134
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
8135
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
8136
+
8137
+ const int64_t i13 = i03 % ne13;
8138
+ const int64_t i12 = i02 % ne12;
8139
+ const int64_t i11 = i01 % ne11;
8006
8140
 
8141
+ float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
8142
+ float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
8143
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
8007
8144
 
8008
8145
  #ifdef GGML_USE_ACCELERATE
8009
8146
  UNUSED(ggml_vec_mul_f32);
8010
8147
 
8011
- vDSP_vmul(
8012
- (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
8013
- (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
8014
- (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
8015
- ne0);
8148
+ vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
8016
8149
  #else
8017
- ggml_vec_mul_f32(ne0,
8018
- (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
8019
- (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
8020
- (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
8150
+ ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
8021
8151
  #endif
8022
8152
  // }
8023
8153
  // }
8024
8154
  }
8025
8155
  } else {
8026
8156
  // src1 is not contiguous
8027
- for (int ir = ith; ir < nr; ir += nth) {
8028
- // src0, src1 and dst are same shape => same indices
8029
- const int i3 = ir/(ne2*ne1);
8030
- const int i2 = (ir - i3*ne2*ne1)/ne1;
8031
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
8157
+ for (int64_t ir = ith; ir < nr; ir += nth) {
8158
+ // src0 and dst are same shape => same indices
8159
+ // src1 is broadcastable across src0 and dst in i1, i2, i3
8160
+ const int64_t i03 = ir/(ne02*ne01);
8161
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
8162
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
8032
8163
 
8033
- float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
8034
- float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
8035
- for (int i0 = 0; i0 < ne0; i0++) {
8036
- float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
8164
+ const int64_t i13 = i03 % ne13;
8165
+ const int64_t i12 = i02 % ne12;
8166
+ const int64_t i11 = i01 % ne11;
8167
+
8168
+ float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
8169
+ float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
8170
+
8171
+ for (int64_t i0 = 0; i0 < ne00; i0++) {
8172
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
8037
8173
 
8038
8174
  dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
8039
8175
  }
@@ -10527,6 +10663,7 @@ static void ggml_compute_forward_diag_mask_f32(
10527
10663
 
10528
10664
  const int n_past = ((int32_t *) src1->data)[0];
10529
10665
  const bool inplace = (bool)((int32_t *) src1->data)[1];
10666
+
10530
10667
  assert(n_past >= 0);
10531
10668
 
10532
10669
  if (!inplace && (params->type == GGML_TASK_INIT)) {
@@ -10697,14 +10834,15 @@ static void ggml_compute_forward_alibi_f32(
10697
10834
  struct ggml_tensor * dst) {
10698
10835
  assert(params->ith == 0);
10699
10836
  assert(src1->type == GGML_TYPE_I32);
10700
- assert(ggml_nelements(src1) == 2);
10837
+ assert(ggml_nelements(src1) == 3);
10701
10838
 
10702
10839
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10703
10840
  return;
10704
10841
  }
10705
10842
 
10706
- const int n_past = ((int32_t *) src1->data)[0];
10707
- const int n_head = ((int32_t *) src1->data)[1];
10843
+ const int n_past = ((int32_t *) src1->data)[0];
10844
+ const int n_head = ((int32_t *) src1->data)[1];
10845
+ const float max_bias = ((float *) src1->data)[2];
10708
10846
 
10709
10847
  assert(n_past >= 0);
10710
10848
 
@@ -10727,8 +10865,8 @@ static void ggml_compute_forward_alibi_f32(
10727
10865
  // add alibi to src0 (KQ_scaled)
10728
10866
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
10729
10867
 
10730
- const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
10731
- const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
10868
+ const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
10869
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
10732
10870
 
10733
10871
  for (int i = 0; i < ne0; i++) {
10734
10872
  for (int j = 0; j < ne1; j++) {
@@ -10746,13 +10884,13 @@ static void ggml_compute_forward_alibi_f32(
10746
10884
  m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
10747
10885
  }
10748
10886
 
10749
- pdst[0] = i * m_k + src[0];
10887
+ pdst[0] = (i-ne0+1) * m_k + src[0];
10888
+
10750
10889
  }
10751
10890
  }
10752
10891
  }
10753
10892
  }
10754
10893
 
10755
-
10756
10894
  static void ggml_compute_forward_alibi_f16(
10757
10895
  const struct ggml_compute_params * params,
10758
10896
  const struct ggml_tensor * src0,
@@ -10760,14 +10898,15 @@ static void ggml_compute_forward_alibi_f16(
10760
10898
  struct ggml_tensor * dst) {
10761
10899
  assert(params->ith == 0);
10762
10900
  assert(src1->type == GGML_TYPE_I32);
10763
- assert(ggml_nelements(src1) == 2);
10901
+ assert(ggml_nelements(src1) == 3);
10764
10902
 
10765
10903
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10766
10904
  return;
10767
10905
  }
10768
10906
 
10769
- const int n_past = ((int32_t *) src1->data)[0];
10770
- const int n_head = ((int32_t *) src1->data)[1];
10907
+ const int n_past = ((int32_t *) src1->data)[0];
10908
+ const int n_head = ((int32_t *) src1->data)[1];
10909
+ const float max_bias = ((float *) src1->data)[2];
10771
10910
 
10772
10911
  assert(n_past >= 0);
10773
10912
 
@@ -10790,8 +10929,8 @@ static void ggml_compute_forward_alibi_f16(
10790
10929
  // add alibi to src0 (KQ_scaled)
10791
10930
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
10792
10931
 
10793
- const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
10794
- const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
10932
+ const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
10933
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
10795
10934
 
10796
10935
  for (int i = 0; i < ne0; i++) {
10797
10936
  for (int j = 0; j < ne1; j++) {
@@ -10810,7 +10949,7 @@ static void ggml_compute_forward_alibi_f16(
10810
10949
  }
10811
10950
 
10812
10951
  // we return F32
10813
- pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
10952
+ pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]);
10814
10953
  }
10815
10954
  }
10816
10955
  }
@@ -10846,6 +10985,77 @@ static void ggml_compute_forward_alibi(
10846
10985
  }
10847
10986
  }
10848
10987
 
10988
+
10989
+ // ggml_compute_forward_clamp
10990
+
10991
+ static void ggml_compute_forward_clamp_f32(
10992
+ const struct ggml_compute_params * params,
10993
+ const struct ggml_tensor * src0,
10994
+ const struct ggml_tensor * src1,
10995
+ struct ggml_tensor * dst) {
10996
+ assert(params->ith == 0);
10997
+ assert(src1->type == GGML_TYPE_I32);
10998
+ assert(ggml_nelements(src1) == 2);
10999
+
11000
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11001
+ return;
11002
+ }
11003
+
11004
+ const int min = ((float *) src1->data)[0];
11005
+ const int max = ((float *) src1->data)[1];
11006
+
11007
+ const int ith = params->ith;
11008
+ const int nth = params->nth;
11009
+
11010
+ const int n = ggml_nrows(src0);
11011
+ const int nc = src0->ne[0];
11012
+
11013
+ const size_t nb00 = src0->nb[0];
11014
+ const size_t nb01 = src0->nb[1];
11015
+
11016
+ const size_t nb0 = dst->nb[0];
11017
+ const size_t nb1 = dst->nb[1];
11018
+
11019
+ GGML_ASSERT( nb0 == sizeof(float));
11020
+ GGML_ASSERT(nb00 == sizeof(float));
11021
+
11022
+ for (int j = ith; j < n; j += nth) {
11023
+ float * dst_ptr = (float *) ((char *) dst->data + j*nb1);
11024
+ float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
11025
+
11026
+ for (int i = 0; i < nc; i++) {
11027
+ dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
11028
+ }
11029
+ }
11030
+ }
11031
+
11032
+ static void ggml_compute_forward_clamp(
11033
+ const struct ggml_compute_params * params,
11034
+ const struct ggml_tensor * src0,
11035
+ const struct ggml_tensor * src1,
11036
+ struct ggml_tensor * dst) {
11037
+ switch (src0->type) {
11038
+ case GGML_TYPE_F32:
11039
+ {
11040
+ ggml_compute_forward_clamp_f32(params, src0, src1, dst);
11041
+ } break;
11042
+ case GGML_TYPE_F16:
11043
+ case GGML_TYPE_Q4_0:
11044
+ case GGML_TYPE_Q4_1:
11045
+ case GGML_TYPE_Q5_0:
11046
+ case GGML_TYPE_Q5_1:
11047
+ case GGML_TYPE_Q8_0:
11048
+ case GGML_TYPE_Q8_1:
11049
+ case GGML_TYPE_I8:
11050
+ case GGML_TYPE_I16:
11051
+ case GGML_TYPE_I32:
11052
+ case GGML_TYPE_COUNT:
11053
+ {
11054
+ GGML_ASSERT(false);
11055
+ } break;
11056
+ }
11057
+ }
11058
+
10849
11059
  // ggml_compute_forward_rope
10850
11060
 
10851
11061
  static void ggml_compute_forward_rope_f32(
@@ -12827,6 +13037,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
12827
13037
  {
12828
13038
  ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
12829
13039
  } break;
13040
+ case GGML_OP_CLAMP:
13041
+ {
13042
+ ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
13043
+ } break;
12830
13044
  case GGML_OP_CONV_1D_1S:
12831
13045
  {
12832
13046
  ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
@@ -13134,6 +13348,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
13134
13348
  {
13135
13349
  GGML_ASSERT(false); // TODO: not implemented
13136
13350
  } break;
13351
+ case GGML_OP_CLAMP:
13352
+ {
13353
+ GGML_ASSERT(false); // TODO: not implemented
13354
+ } break;
13137
13355
  case GGML_OP_SILU:
13138
13356
  {
13139
13357
  // necessary for llama
@@ -14013,6 +14231,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
14013
14231
  {
14014
14232
  node->n_tasks = 1; //TODO
14015
14233
  } break;
14234
+ case GGML_OP_CLAMP:
14235
+ {
14236
+ node->n_tasks = 1; //TODO
14237
+ } break;
14016
14238
  case GGML_OP_CONV_1D_1S:
14017
14239
  case GGML_OP_CONV_1D_2S:
14018
14240
  {
@@ -14409,9 +14631,12 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
14409
14631
  fprintf(fp, "%s |", node->name);
14410
14632
  }
14411
14633
 
14412
- fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s",
14413
- i, node->ne[0], node->ne[1],
14414
- GGML_OP_SYMBOL[node->op]);
14634
+ if (node->n_dims == 2) {
14635
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], GGML_OP_SYMBOL[node->op]);
14636
+ } else {
14637
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
14638
+ }
14639
+
14415
14640
 
14416
14641
  if (node->grad) {
14417
14642
  fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);