llama_cpp 0.1.1 → 0.1.2

This diff compares the contents of package versions as publicly released to a supported registry; it is provided for informational purposes only and reflects the packages as they appear in that registry.
@@ -512,7 +512,7 @@ static inline int hsum_i32_4(const __m128i a) {
512
512
  return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
513
513
  }
514
514
 
515
- #if __AVX2__ || __AVX512F__
515
+ #if defined(__AVX2__) || defined(__AVX512F__)
516
516
  // spread 32 bits to 32 bytes { 0x00, 0xFF }
517
517
  static inline __m256i bytes_from_bits_32(const uint8_t * x) {
518
518
  uint32_t x32;
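
This hunk and the __ARM_NEON hunk below change feature tests from #if MACRO to #if defined(MACRO). A small illustration of the difference (not from the package): in the first form an undefined identifier silently evaluates to 0, which works but triggers -Wundef; the defined() form states the intent explicitly.

    /* Illustrative only: both guards compile, but the second does not rely on an
       undefined identifier evaluating to 0 and stays quiet under -Wundef. */
    #if __AVX2__ || __AVX512F__                      /* old form */
    #endif
    #if defined(__AVX2__) || defined(__AVX512F__)    /* new form */
    #endif
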
@@ -688,7 +688,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
688
688
  #endif // __AVX__ || __AVX2__ || __AVX512F__
689
689
  #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
690
690
 
691
- #if __ARM_NEON
691
+ #if defined(__ARM_NEON)
692
692
 
693
693
  #if !defined(__aarch64__)
694
694
 
@@ -740,19 +740,19 @@ inline static float vaddvq_f32(float32x4_t v) {
740
740
  return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
741
741
  }
742
742
 
743
- float vminvq_f32(float32x4_t v) {
743
+ inline static float vminvq_f32(float32x4_t v) {
744
744
  return
745
745
  MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
746
746
  MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
747
747
  }
748
748
 
749
- float vmaxvq_f32(float32x4_t v) {
749
+ inline static float vmaxvq_f32(float32x4_t v) {
750
750
  return
751
751
  MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
752
752
  MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
753
753
  }
754
754
 
755
- int32x4_t vcvtnq_s32_f32(float32x4_t v) {
755
+ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
756
756
  int32x4_t res;
757
757
 
758
758
  res[0] = roundf(vgetq_lane_f32(v, 0));
@@ -766,21 +766,20 @@ int32x4_t vcvtnq_s32_f32(float32x4_t v) {
766
766
  #endif
767
767
  #endif
768
768
 
769
-
770
769
  #define QK4_0 32
771
770
  typedef struct {
772
- float d; // delta
771
+ ggml_fp16_t d; // delta
773
772
  uint8_t qs[QK4_0 / 2]; // nibbles / quants
774
773
  } block_q4_0;
775
- static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
774
+ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
776
775
 
777
776
  #define QK4_1 32
778
777
  typedef struct {
779
- float d; // delta
780
- float m; // min
778
+ ggml_fp16_t d; // delta
779
+ ggml_fp16_t m; // min
781
780
  uint8_t qs[QK4_1 / 2]; // nibbles / quants
782
781
  } block_q4_1;
783
- static_assert(sizeof(block_q4_1) == 2 * sizeof(float) + QK4_1 / 2, "wrong q4_1 block size/padding");
782
+ static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
784
783
 
785
784
  #define QK5_0 32
786
785
  typedef struct {
@@ -801,16 +800,16 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
801
800
 
802
801
  #define QK8_0 32
803
802
  typedef struct {
804
- float d; // delta
805
- int8_t qs[QK8_0]; // quants
803
+ ggml_fp16_t d; // delta
804
+ int8_t qs[QK8_0]; // quants
806
805
  } block_q8_0;
807
- static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
806
+ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
808
807
 
809
808
  #define QK8_1 32
810
809
  typedef struct {
811
- float d; // delta
812
- float s; // d * sum(qs[i])
813
- int8_t qs[QK8_1]; // quants
810
+ float d; // delta
811
+ float s; // d * sum(qs[i])
812
+ int8_t qs[QK8_1]; // quants
814
813
  } block_q8_1;
815
814
  static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
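
The central change in this run of hunks: block_q4_0, block_q4_1 and block_q8_0 now store their scale (and, for q4_1, the minimum) as ggml_fp16_t instead of float, while block_q8_1 keeps 32-bit floats for d and s. Restating the static_asserts above as a per-block byte count (sizeof(ggml_fp16_t) == 2):

    /* Per-block storage for 32 weights, as implied by the static_asserts above:      */
    /*   block_q4_0:  2 (fp16 d)              + 16 (nibbles) = 18 bytes   (was 20)    */
    /*   block_q4_1:  2 (fp16 d) + 2 (fp16 m) + 16 (nibbles) = 20 bytes   (was 24)    */
    /*   block_q8_0:  2 (fp16 d)              + 32 (int8)    = 34 bytes   (was 36)    */
    /*   block_q8_1:  4 (f32 d)  + 4 (f32 s)  + 32 (int8)    = 40 bytes   (unchanged) */

Every reader of these blocks therefore converts the scale on the fly with GGML_FP16_TO_FP32, and every writer uses GGML_FP32_TO_FP16, as the later hunks show.
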
816
815
 
@@ -837,7 +836,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
837
836
  const float d = max / -8;
838
837
  const float id = d ? 1.0f/d : 0.0f;
839
838
 
840
- y[i].d = d;
839
+ y[i].d = GGML_FP32_TO_FP16(d);
841
840
 
842
841
  for (int j = 0; j < qk/2; ++j) {
843
842
  const float x0 = x[i*qk + 0 + j]*id;
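
For reference, the Q4_0 scheme this routine implements (restated from the scale line above and the dequantize_row_q4_0 hunk further down): each weight is packed as a 4-bit code and reconstructed as

    \hat{x}_j = d\,(q_j - 8), \qquad d = \mathrm{max} / (-8), \qquad q_j \in [0, 15]

where max comes from the elided preceding lines and is chosen so the block's extreme value lands on level -8. The only change here is that d is now written back as FP16 via GGML_FP32_TO_FP16.
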
@@ -877,8 +876,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r
877
876
  const float d = (max - min) / ((1 << 4) - 1);
878
877
  const float id = d ? 1.0f/d : 0.0f;
879
878
 
880
- y[i].d = d;
881
- y[i].m = min;
879
+ y[i].d = GGML_FP32_TO_FP16(d);
880
+ y[i].m = GGML_FP32_TO_FP16(min);
882
881
 
883
882
  for (int j = 0; j < qk/2; ++j) {
884
883
  const float x0 = (x[i*qk + 0 + j] - min)*id;
@@ -1009,7 +1008,7 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
1009
1008
  const float d = amax / ((1 << 7) - 1);
1010
1009
  const float id = d ? 1.0f/d : 0.0f;
1011
1010
 
1012
- y[i].d = d;
1011
+ y[i].d = GGML_FP32_TO_FP16(d);
1013
1012
 
1014
1013
  for (int j = 0; j < QK8_0; ++j) {
1015
1014
  const float x0 = x[i*QK8_0 + j]*id;
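
Q8_0 follows the same scheme with full 8-bit codes; amax is the block's absolute maximum (computed in the elided lines above), and the change is again only that d is stored as FP16:

    \hat{x}_j = d\, q_j, \qquad d = \mathrm{amax} / 127, \qquad q_j \in [-127, 127]
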
@@ -1044,7 +1043,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
1044
1043
  const float d = amax / ((1 << 7) - 1);
1045
1044
  const float id = d ? 1.0f/d : 0.0f;
1046
1045
 
1047
- y[i].d = d;
1046
+ y[i].d = GGML_FP32_TO_FP16(d);
1048
1047
 
1049
1048
  for (int j = 0; j < 8; j++) {
1050
1049
  const float32x4_t v = vmulq_n_f32(srcv[j], id);
@@ -1056,6 +1055,39 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
1056
1055
  y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
1057
1056
  }
1058
1057
  }
1058
+ #elif defined(__wasm_simd128__)
1059
+ for (int i = 0; i < nb; i++) {
1060
+ v128_t srcv [8];
1061
+ v128_t asrcv[8];
1062
+ v128_t amaxv[8];
1063
+
1064
+ for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
1065
+ for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
1066
+
1067
+ for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
1068
+ for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
1069
+ for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
1070
+
1071
+ const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
1072
+ wasm_f32x4_extract_lane(amaxv[0], 1)),
1073
+ MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
1074
+ wasm_f32x4_extract_lane(amaxv[0], 3)));
1075
+
1076
+ const float d = amax / ((1 << 7) - 1);
1077
+ const float id = d ? 1.0f/d : 0.0f;
1078
+
1079
+ y[i].d = GGML_FP32_TO_FP16(d);
1080
+
1081
+ for (int j = 0; j < 8; j++) {
1082
+ const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
1083
+ const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
1084
+
1085
+ y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
1086
+ y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
1087
+ y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
1088
+ y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
1089
+ }
1090
+ }
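
The new __wasm_simd128__ branch mirrors the NEON path just above: a pairwise f32x4 max reduction yields the block's absolute maximum, each lane is scaled by 1/d and converted with a truncating, saturating float-to-int, and the scale is stored as FP16. For readability, the three reduction loops unroll to (restated as comments, not new code):

    /* step 1: amaxv[0], amaxv[2], amaxv[4], amaxv[6] = max of adjacent pairs of asrcv[0..7] */
    /* step 2: amaxv[0] = max(amaxv[0], amaxv[2]);  amaxv[4] = max(amaxv[4], amaxv[6])       */
    /* step 3: amaxv[0] = max(amaxv[0], amaxv[4])                                            */
    /* amax   = horizontal max over the four lanes of amaxv[0]                               */
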
1059
1091
  #elif defined(__AVX2__) || defined(__AVX__)
1060
1092
  for (int i = 0; i < nb; i++) {
1061
1093
  // Load elements into 4 AVX vectors
@@ -1079,7 +1111,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
1079
1111
 
1080
1112
  // Quantize these floats
1081
1113
  const float d = maxScalar / 127.f;
1082
- y[i].d = d;
1114
+ y[i].d = GGML_FP32_TO_FP16(d);
1083
1115
  const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
1084
1116
  const __m256 mul = _mm256_set1_ps( id );
1085
1117
 
@@ -1178,7 +1210,7 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r
1178
1210
  sum += y[i].qs[QK8_1/2 + j];
1179
1211
  }
1180
1212
 
1181
- y[i].s = d * sum;
1213
+ y[i].s = sum*d;
1182
1214
  }
1183
1215
  }
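
block_q8_1 keeps 32-bit floats and its cached s field because the q4_1 and q5_1 dot products later in this diff use it: a q4_1 weight reconstructs as d_x * q_x + m_x, so the per-block dot product splits as

    \sum_j (d_x q^x_j + m_x)(d_y q^y_j) = d_x d_y \sum_j q^x_j q^y_j + m_x s_y, \qquad s_y = d_y \sum_j q^y_j

which is why ggml_vec_dot_q4_1_q8_1 below accumulates summs += x0->m * y0->s and adds it to the integer part at the end.
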
1184
1216
 
@@ -1224,6 +1256,48 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
1224
1256
 
1225
1257
  y[i].s = d * vaddvq_s32(accv);
1226
1258
  }
1259
+ #elif defined(__wasm_simd128__)
1260
+ for (int i = 0; i < nb; i++) {
1261
+ v128_t srcv [8];
1262
+ v128_t asrcv[8];
1263
+ v128_t amaxv[8];
1264
+
1265
+ for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
1266
+ for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
1267
+
1268
+ for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
1269
+ for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
1270
+ for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
1271
+
1272
+ const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
1273
+ wasm_f32x4_extract_lane(amaxv[0], 1)),
1274
+ MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
1275
+ wasm_f32x4_extract_lane(amaxv[0], 3)));
1276
+
1277
+ const float d = amax / ((1 << 7) - 1);
1278
+ const float id = d ? 1.0f/d : 0.0f;
1279
+
1280
+ y[i].d = d;
1281
+
1282
+ v128_t accv = wasm_i32x4_splat(0);
1283
+
1284
+ for (int j = 0; j < 8; j++) {
1285
+ const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
1286
+ const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
1287
+
1288
+ y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
1289
+ y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
1290
+ y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
1291
+ y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
1292
+
1293
+ accv = wasm_i32x4_add(accv, vi);
1294
+ }
1295
+
1296
+ y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) +
1297
+ wasm_i32x4_extract_lane(accv, 1) +
1298
+ wasm_i32x4_extract_lane(accv, 2) +
1299
+ wasm_i32x4_extract_lane(accv, 3));
1300
+ }
1227
1301
  #elif defined(__AVX2__) || defined(__AVX__)
1228
1302
  for (int i = 0; i < nb; i++) {
1229
1303
  // Load elements into 4 AVX vectors
@@ -1330,7 +1404,7 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict
1330
1404
  const int nb = k / qk;
1331
1405
 
1332
1406
  for (int i = 0; i < nb; i++) {
1333
- const float d = x[i].d;
1407
+ const float d = GGML_FP16_TO_FP32(x[i].d);
1334
1408
 
1335
1409
  for (int j = 0; j < qk/2; ++j) {
1336
1410
  const int x0 = (x[i].qs[j] & 0x0F) - 8;
@@ -1350,8 +1424,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict
1350
1424
  const int nb = k / qk;
1351
1425
 
1352
1426
  for (int i = 0; i < nb; i++) {
1353
- const float d = x[i].d;
1354
- const float m = x[i].m;
1427
+ const float d = GGML_FP16_TO_FP32(x[i].d);
1428
+ const float m = GGML_FP16_TO_FP32(x[i].m);
1355
1429
 
1356
1430
  for (int j = 0; j < qk/2; ++j) {
1357
1431
  const int x0 = (x[i].qs[j] & 0x0F);
@@ -1426,7 +1500,7 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
1426
1500
  const block_q8_0 * restrict x = vx;
1427
1501
 
1428
1502
  for (int i = 0; i < nb; i++) {
1429
- const float d = x[i].d;
1503
+ const float d = GGML_FP16_TO_FP32(x[i].d);
1430
1504
 
1431
1505
  for (int j = 0; j < qk; ++j) {
1432
1506
  y[i*qk + j] = x[i].qs[j]*d;
@@ -1690,8 +1764,9 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
1690
1764
  static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
1691
1765
  float tmp[8];
1692
1766
 
1693
- for (int i = 0; i < 8; i++)
1767
+ for (int i = 0; i < 8; i++) {
1694
1768
  tmp[i] = GGML_FP16_TO_FP32(x[i]);
1769
+ }
1695
1770
 
1696
1771
  return _mm256_loadu_ps(tmp);
1697
1772
  }
@@ -2111,8 +2186,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2111
2186
  const block_q8_0 * restrict y0 = &y[i + 0];
2112
2187
  const block_q8_0 * restrict y1 = &y[i + 1];
2113
2188
 
2114
- const uint8x16_t m4b = vdupq_n_u8(0x0F);
2115
- const int8x16_t s8b = vdupq_n_s8(0x8);
2189
+ const uint8x16_t m4b = vdupq_n_u8(0x0F);
2190
+ const int8x16_t s8b = vdupq_n_s8(0x8);
2116
2191
 
2117
2192
  const uint8x16_t v0_0 = vld1q_u8(x0->qs);
2118
2193
  const uint8x16_t v0_1 = vld1q_u8(x1->qs);
@@ -2140,8 +2215,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2140
2215
  const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
2141
2216
  const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
2142
2217
 
2143
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
2144
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
2218
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2219
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
2145
2220
  #else
2146
2221
  const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
2147
2222
  const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
@@ -2158,8 +2233,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2158
2233
  const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2159
2234
  const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2160
2235
 
2161
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
2162
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
2236
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2237
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
2163
2238
  #endif
2164
2239
  }
2165
2240
 
@@ -2171,7 +2246,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2171
2246
  // Main loop
2172
2247
  for (int i = 0; i < nb; ++i) {
2173
2248
  /* Compute combined scale for the block */
2174
- const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
2249
+ const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
2175
2250
 
2176
2251
  __m256i bx = bytes_from_nibbles_32(x[i].qs);
2177
2252
 
@@ -2195,7 +2270,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2195
2270
  // Main loop
2196
2271
  for (int i = 0; i < nb; ++i) {
2197
2272
  // Compute combined scale for the block
2198
- const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
2273
+ const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
2199
2274
 
2200
2275
  const __m128i lowMask = _mm_set1_epi8(0xF);
2201
2276
  const __m128i off = _mm_set1_epi8(8);
@@ -2237,7 +2312,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2237
2312
  _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0);
2238
2313
 
2239
2314
  // Compute combined scale for the block 0 and 1
2240
- const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[0].d ), _mm_set1_ps( y[0].d ) );
2315
+ const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) );
2241
2316
 
2242
2317
  const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
2243
2318
 
@@ -2255,7 +2330,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2255
2330
  _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);
2256
2331
 
2257
2332
  // Compute combined scale for the block 2 and 3
2258
- const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[1].d ), _mm_set1_ps( y[1].d ) );
2333
+ const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) );
2259
2334
 
2260
2335
  const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);
2261
2336
 
@@ -2288,7 +2363,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2288
2363
  _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
2289
2364
 
2290
2365
  // Compute combined scale for the block 0 and 1
2291
- const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[i].d ), _mm_set1_ps( y[i].d ) );
2366
+ const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
2292
2367
 
2293
2368
  const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);
2294
2369
 
@@ -2306,7 +2381,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2306
2381
  _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
2307
2382
 
2308
2383
  // Compute combined scale for the block 2 and 3
2309
- const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[i + 1].d ), _mm_set1_ps( y[i + 1].d ) );
2384
+ const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) );
2310
2385
 
2311
2386
  const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);
2312
2387
 
@@ -2354,7 +2429,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2354
2429
  sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
2355
2430
  }
2356
2431
 
2357
- sumf += (x[i].d*y[i].d)*sumi;
2432
+ sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
2358
2433
  }
2359
2434
 
2360
2435
  *s = sumf;
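
All of the SIMD branches above compute the same quantity as this scalar tail: one integer dot product per 32-weight block, scaled by the product of the two block scales, both of which are now converted from FP16 at use time:

    s = \sum_b \mathrm{FP32}(d^x_b)\,\mathrm{FP32}(d^y_b) \sum_{j=0}^{31} (q^x_{b,j} - 8)\, q^y_{b,j}
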
@@ -2384,7 +2459,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2384
2459
  const block_q8_1 * restrict y0 = &y[i + 0];
2385
2460
  const block_q8_1 * restrict y1 = &y[i + 1];
2386
2461
 
2387
- summs += x0->m * y0->s + x1->m * y1->s;
2462
+ summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s;
2388
2463
 
2389
2464
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
2390
2465
 
@@ -2408,8 +2483,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2408
2483
  const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
2409
2484
  const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
2410
2485
 
2411
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
2412
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
2486
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
2487
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
2413
2488
  #else
2414
2489
  const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
2415
2490
  const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
@@ -2426,8 +2501,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2426
2501
  const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2427
2502
  const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2428
2503
 
2429
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
2430
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
2504
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
2505
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
2431
2506
  #endif
2432
2507
  }
2433
2508
 
@@ -2440,13 +2515,13 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2440
2515
 
2441
2516
  // Main loop
2442
2517
  for (int i = 0; i < nb; ++i) {
2443
- const float * d0 = &x[i].d;
2444
- const float * d1 = &y[i].d;
2518
+ const float d0 = GGML_FP16_TO_FP32(x[i].d);
2519
+ const float d1 = y[i].d;
2445
2520
 
2446
- summs += x[i].m * y[i].s;
2521
+ summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
2447
2522
 
2448
- const __m256 d0v = _mm256_broadcast_ss( d0 );
2449
- const __m256 d1v = _mm256_broadcast_ss( d1 );
2523
+ const __m256 d0v = _mm256_set1_ps( d0 );
2524
+ const __m256 d1v = _mm256_set1_ps( d1 );
2450
2525
 
2451
2526
  // Compute combined scales
2452
2527
  const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
@@ -2480,7 +2555,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
2480
2555
  sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
2481
2556
  }
2482
2557
 
2483
- sumf += (x[i].d*y[i].d)*sumi + x[i].m*y[i].s;
2558
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
2484
2559
  }
2485
2560
 
2486
2561
  *s = sumf;
@@ -2556,16 +2631,13 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2556
2631
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
2557
2632
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
2558
2633
 
2559
- const float x0d = GGML_FP16_TO_FP32(x0->d);
2560
- const float x1d = GGML_FP16_TO_FP32(x1->d);
2561
-
2562
2634
  #if defined(__ARM_FEATURE_DOTPROD)
2563
2635
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
2564
2636
  vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
2565
- vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d);
2637
+ vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2566
2638
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
2567
2639
  vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
2568
- vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d);
2640
+ vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
2569
2641
  #else
2570
2642
  const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
2571
2643
  const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
@@ -2582,8 +2654,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2582
2654
  const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2583
2655
  const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2584
2656
 
2585
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
2586
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d);
2657
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2658
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
2587
2659
  #endif
2588
2660
  }
2589
2661
 
@@ -2600,7 +2672,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2600
2672
  const block_q8_0 * restrict y0 = &y[i];
2601
2673
 
2602
2674
  const v128_t m4b = wasm_i8x16_splat(0x0F);
2603
- const v128_t s16b = wasm_i8x16_splat(0x10);
2604
2675
 
2605
2676
  // extract the 5th bit
2606
2677
  memcpy(&qh, x0->qh, sizeof(qh));
@@ -2638,15 +2709,14 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2638
2709
  const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
2639
2710
  const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
2640
2711
 
2641
- const float x0d = GGML_FP16_TO_FP32(x0->d);
2642
-
2643
2712
  // dot product
2644
2713
  sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
2645
2714
  wasm_i32x4_add(
2646
2715
  wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
2647
2716
  wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
2648
2717
  wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
2649
- wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
2718
+ wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
2719
+ wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
2650
2720
  }
2651
2721
 
2652
2722
  *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -2658,7 +2728,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2658
2728
  // Main loop
2659
2729
  for (int i = 0; i < nb; i++) {
2660
2730
  /* Compute combined scale for the block */
2661
- const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
2731
+ const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
2662
2732
 
2663
2733
  __m256i bx = bytes_from_nibbles_32(x[i].qs);
2664
2734
  __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -2682,7 +2752,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2682
2752
  // Main loop
2683
2753
  for (int i = 0; i < nb; i++) {
2684
2754
  /* Compute combined scale for the block */
2685
- const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
2755
+ const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
2686
2756
 
2687
2757
  __m256i bx = bytes_from_nibbles_32(x[i].qs);
2688
2758
  const __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -2725,7 +2795,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2725
2795
  sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
2726
2796
  }
2727
2797
 
2728
- sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi;
2798
+ sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
2729
2799
  }
2730
2800
 
2731
2801
  *s = sumf;
@@ -2807,16 +2877,13 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2807
2877
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
2808
2878
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
2809
2879
 
2810
- const float x0d = GGML_FP16_TO_FP32(x0->d);
2811
- const float x1d = GGML_FP16_TO_FP32(x1->d);
2812
-
2813
2880
  #if defined(__ARM_FEATURE_DOTPROD)
2814
2881
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
2815
2882
  vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
2816
- vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d);
2883
+ vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
2817
2884
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
2818
2885
  vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
2819
- vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d);
2886
+ vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
2820
2887
  #else
2821
2888
  const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
2822
2889
  const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
@@ -2833,8 +2900,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2833
2900
  const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2834
2901
  const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2835
2902
 
2836
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
2837
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d);
2903
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
2904
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
2838
2905
  #endif
2839
2906
  }
2840
2907
 
@@ -2873,8 +2940,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2873
2940
  const v128_t v0l = wasm_v128_and (v0, m4b);
2874
2941
  const v128_t v0h = wasm_u8x16_shr(v0, 4);
2875
2942
 
2876
- static bool x = true;
2877
-
2878
2943
  // add high bit
2879
2944
  const v128_t v0lf = wasm_v128_or(v0l, qhl);
2880
2945
  const v128_t v0hf = wasm_v128_or(v0h, qhh);
@@ -2894,15 +2959,14 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2894
2959
  const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
2895
2960
  const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
2896
2961
 
2897
- const float x0d = GGML_FP16_TO_FP32(x0->d);
2898
-
2899
2962
  // dot product
2900
- sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
2901
- wasm_i32x4_add(
2963
+ sumv = wasm_f32x4_add(sumv,
2964
+ wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
2902
2965
  wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
2903
2966
  wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
2904
2967
  wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
2905
- wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
2968
+ wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
2969
+ wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
2906
2970
  }
2907
2971
 
2908
2972
  *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -2924,7 +2988,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2924
2988
  bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
2925
2989
  bx = _mm256_or_si256(bx, bxhi);
2926
2990
 
2927
- const __m256 dy = _mm256_broadcast_ss(&y[i].d);
2991
+ const __m256 dy = _mm256_set1_ps(y[i].d);
2928
2992
  const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
2929
2993
 
2930
2994
  const __m256 q = mul_sum_us8_pairs_float(bx, by);
@@ -2958,7 +3022,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2958
3022
  bxh = _mm_or_si128(bxh, bxhih);
2959
3023
  bx = _mm256_set_m128i(bxh, bxl);
2960
3024
 
2961
- const __m256 dy = _mm256_broadcast_ss(&y[i].d);
3025
+ const __m256 dy = _mm256_set1_ps(y[i].d);
2962
3026
  const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
2963
3027
 
2964
3028
  const __m256 q = mul_sum_us8_pairs_float(bx, by);
@@ -3028,11 +3092,11 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3028
3092
  #if defined(__ARM_FEATURE_DOTPROD)
3029
3093
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
3030
3094
  vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
3031
- vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), x0->d*y0->d);
3095
+ vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
3032
3096
 
3033
3097
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
3034
3098
  vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
3035
- vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), x1->d*y1->d);
3099
+ vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
3036
3100
 
3037
3101
  #else
3038
3102
  const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
@@ -3050,8 +3114,8 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3050
3114
  const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
3051
3115
  const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
3052
3116
 
3053
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), x0->d*y0->d);
3054
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), x1->d*y1->d);
3117
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
3118
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
3055
3119
  #endif
3056
3120
  }
3057
3121
 
@@ -3063,7 +3127,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3063
3127
  // Main loop
3064
3128
  for (int i = 0; i < nb; ++i) {
3065
3129
  // Compute combined scale for the block
3066
- const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
3130
+ const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
3067
3131
  __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
3068
3132
  __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
3069
3133
 
@@ -3089,7 +3153,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
3089
3153
  sumi += x[i].qs[j]*y[i].qs[j];
3090
3154
  }
3091
3155
 
3092
- sumf += (x[i].d*y[i].d)*sumi;
3156
+ sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
3093
3157
  }
3094
3158
 
3095
3159
  *s = sumf;
@@ -3478,6 +3542,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
3478
3542
  "ROPE",
3479
3543
  "ROPE_BACK",
3480
3544
  "ALIBI",
3545
+ "CLAMP",
3481
3546
  "CONV_1D_1S",
3482
3547
  "CONV_1D_2S",
3483
3548
 
@@ -3488,7 +3553,8 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
3488
3553
  "MAP_BINARY",
3489
3554
  };
3490
3555
 
3491
- static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50");
3556
+ static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");
3557
+
3492
3558
 
3493
3559
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3494
3560
  "none",
@@ -3538,6 +3604,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3538
3604
  "rope(x)",
3539
3605
  "rope_back(x)",
3540
3606
  "alibi(x)",
3607
+ "clamp(x)",
3541
3608
  "conv_1d_1s(x)",
3542
3609
  "conv_1d_2s(x)",
3543
3610
 
@@ -3548,7 +3615,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3548
3615
  "f(x,y)",
3549
3616
  };
3550
3617
 
3551
- static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50");
3618
+ static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");
3552
3619
 
3553
3620
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
3554
3621
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -3782,6 +3849,12 @@ static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct g
3782
3849
  (t1->ne[3]%t0->ne[3] == 0);
3783
3850
  }
3784
3851
 
3852
+ static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
3853
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3854
+
3855
+ return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
3856
+ }
3857
+
3785
3858
  static inline int ggml_up32(int n) {
3786
3859
  return (n + 31) & ~31;
3787
3860
  }
@@ -4664,11 +4737,15 @@ struct ggml_tensor * ggml_mul_impl(
4664
4737
  struct ggml_tensor * a,
4665
4738
  struct ggml_tensor * b,
4666
4739
  bool inplace) {
4667
- GGML_ASSERT(ggml_are_same_shape(a, b));
4740
+ // TODO: support less-strict constraint
4741
+ // GGML_ASSERT(ggml_can_repeat(b, a));
4742
+ GGML_ASSERT(ggml_can_repeat_rows(b, a));
4668
4743
 
4669
4744
  bool is_node = false;
4670
4745
 
4671
4746
  if (!inplace && (a->grad || b->grad)) {
4747
+ // TODO: support backward pass for broadcasting
4748
+ GGML_ASSERT(ggml_are_same_shape(a, b));
4672
4749
  is_node = true;
4673
4750
  }
4674
4751
 
@@ -6210,7 +6287,8 @@ struct ggml_tensor * ggml_alibi(
6210
6287
  struct ggml_context * ctx,
6211
6288
  struct ggml_tensor * a,
6212
6289
  int n_past,
6213
- int n_head) {
6290
+ int n_head,
6291
+ float bias_max) {
6214
6292
  GGML_ASSERT(n_past >= 0);
6215
6293
  bool is_node = false;
6216
6294
 
@@ -6229,6 +6307,8 @@ struct ggml_tensor * ggml_alibi(
6229
6307
 
6230
6308
  ((int32_t *) b->data)[0] = n_past;
6231
6309
  ((int32_t *) b->data)[1] = n_head;
6310
+ GGML_ASSERT(sizeof(float) == sizeof(int32_t));
6311
+ (((float *) b->data)[2]) = bias_max;
6232
6312
 
6233
6313
  ggml_scratch_load(ctx);
6234
6314
 
@@ -6240,6 +6320,40 @@ struct ggml_tensor * ggml_alibi(
6240
6320
  return result;
6241
6321
  }
6242
6322
 
6323
+ // ggml_clamp
6324
+
6325
+ struct ggml_tensor * ggml_clamp(
6326
+ struct ggml_context * ctx,
6327
+ struct ggml_tensor * a,
6328
+ float min,
6329
+ float max) {
6330
+ bool is_node = false;
6331
+
6332
+ if (a->grad) {
6333
+ GGML_ASSERT(false); // TODO: implement backward
6334
+ is_node = true;
6335
+ }
6336
+
6337
+ // TODO: when implement backward, fix this:
6338
+ struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6339
+
6340
+ ggml_scratch_save(ctx);
6341
+
6342
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
6343
+
6344
+ ((float *) b->data)[0] = min;
6345
+ ((float *) b->data)[1] = max;
6346
+
6347
+ ggml_scratch_load(ctx);
6348
+
6349
+ result->op = GGML_OP_CLAMP;
6350
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6351
+ result->src0 = a;
6352
+ result->src1 = b;
6353
+
6354
+ return result;
6355
+ }
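
A minimal usage sketch for the new operator (hypothetical code, not from the package; it assumes a valid ggml_context and the graph APIs used elsewhere in this file). Note that the forward implementation further down reads the stored float bounds into const int variables and only handles F32 inputs, so fractional bounds are effectively truncated in this version.

    // Hypothetical usage sketch: clamp every element of an F32 tensor into [-5, 5].
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 8);
    struct ggml_tensor * c = ggml_clamp(ctx, t, -5.0f, 5.0f);
    // c is then evaluated like any other node, e.g. through ggml_build_forward(c)
    // followed by ggml_graph_compute(ctx, &graph).
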
6356
+
6243
6357
  // ggml_conv_1d_1s
6244
6358
 
6245
6359
  struct ggml_tensor * ggml_conv_1d_1s(
@@ -7966,7 +8080,7 @@ static void ggml_compute_forward_mul_f32(
7966
8080
  const struct ggml_tensor * src0,
7967
8081
  const struct ggml_tensor * src1,
7968
8082
  struct ggml_tensor * dst) {
7969
- assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
8083
+ GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
7970
8084
 
7971
8085
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
7972
8086
  return;
@@ -7974,10 +8088,25 @@ static void ggml_compute_forward_mul_f32(
7974
8088
  const int ith = params->ith;
7975
8089
  const int nth = params->nth;
7976
8090
 
7977
- const int nr = ggml_nrows(src0);
7978
- const int64_t ne0 = src0->ne[0];
7979
- const int64_t ne1 = src0->ne[1];
7980
- const int64_t ne2 = src0->ne[2];
8091
+ #ifdef GGML_USE_CUBLAS
8092
+ if (src1->backend == GGML_BACKEND_CUDA) {
8093
+ if (ith == 0) {
8094
+ ggml_cuda_mul(src0, src1, dst);
8095
+ }
8096
+ return;
8097
+ }
8098
+ #endif
8099
+
8100
+ const int64_t nr = ggml_nrows(src0);
8101
+
8102
+ const int64_t ne00 = src0->ne[0];
8103
+ const int64_t ne01 = src0->ne[1];
8104
+ const int64_t ne02 = src0->ne[2];
8105
+
8106
+ const int64_t ne10 = src1->ne[0];
8107
+ const int64_t ne11 = src1->ne[1];
8108
+ const int64_t ne12 = src1->ne[2];
8109
+ const int64_t ne13 = src1->ne[3];
7981
8110
 
7982
8111
  const size_t nb00 = src0->nb[0];
7983
8112
  const size_t nb01 = src0->nb[1];
@@ -7996,44 +8125,51 @@ static void ggml_compute_forward_mul_f32(
7996
8125
 
7997
8126
  GGML_ASSERT( nb0 == sizeof(float));
7998
8127
  GGML_ASSERT(nb00 == sizeof(float));
8128
+ GGML_ASSERT(ne00 == ne10);
7999
8129
 
8000
8130
  if (nb10 == sizeof(float)) {
8001
- for (int ir = ith; ir < nr; ir += nth) {
8002
- // src0, src1 and dst are same shape => same indices
8003
- const int i3 = ir/(ne2*ne1);
8004
- const int i2 = (ir - i3*ne2*ne1)/ne1;
8005
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
8131
+ for (int64_t ir = ith; ir < nr; ir += nth) {
8132
+ // src0 and dst are same shape => same indices
8133
+ const int64_t i03 = ir/(ne02*ne01);
8134
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
8135
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
8136
+
8137
+ const int64_t i13 = i03 % ne13;
8138
+ const int64_t i12 = i02 % ne12;
8139
+ const int64_t i11 = i01 % ne11;
8006
8140
 
8141
+ float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
8142
+ float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
8143
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
8007
8144
 
8008
8145
  #ifdef GGML_USE_ACCELERATE
8009
8146
  UNUSED(ggml_vec_mul_f32);
8010
8147
 
8011
- vDSP_vmul(
8012
- (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
8013
- (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
8014
- (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
8015
- ne0);
8148
+ vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
8016
8149
  #else
8017
- ggml_vec_mul_f32(ne0,
8018
- (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
8019
- (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
8020
- (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
8150
+ ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
8021
8151
  #endif
8022
8152
  // }
8023
8153
  // }
8024
8154
  }
8025
8155
  } else {
8026
8156
  // src1 is not contiguous
8027
- for (int ir = ith; ir < nr; ir += nth) {
8028
- // src0, src1 and dst are same shape => same indices
8029
- const int i3 = ir/(ne2*ne1);
8030
- const int i2 = (ir - i3*ne2*ne1)/ne1;
8031
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
8157
+ for (int64_t ir = ith; ir < nr; ir += nth) {
8158
+ // src0 and dst are same shape => same indices
8159
+ // src1 is broadcastable across src0 and dst in i1, i2, i3
8160
+ const int64_t i03 = ir/(ne02*ne01);
8161
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
8162
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
8032
8163
 
8033
- float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
8034
- float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
8035
- for (int i0 = 0; i0 < ne0; i0++) {
8036
- float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
8164
+ const int64_t i13 = i03 % ne13;
8165
+ const int64_t i12 = i02 % ne12;
8166
+ const int64_t i11 = i01 % ne11;
8167
+
8168
+ float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
8169
+ float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
8170
+
8171
+ for (int64_t i0 = 0; i0 < ne00; i0++) {
8172
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
8037
8173
 
8038
8174
  dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
8039
8175
  }
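
Together with ggml_can_repeat_rows introduced earlier in this diff, ggml_mul's forward pass now broadcasts src1 across src0: the tensors must share ne[0], and each destination row (i01, i02, i03) reads src1 row (i01 % ne11, i02 % ne12, i03 % ne13). A minimal sketch of the same indexing on plain arrays (illustrative only, not part of the package):

    /* Illustrative sketch: elementwise multiply of a [ne00 x ne01] matrix by a
       [ne00 x ne11] matrix whose rows repeat when ne11 < ne01 (ne01 % ne11 == 0),
       mirroring the i01 % ne11 indexing in ggml_compute_forward_mul_f32 above. */
    static void mul_broadcast_rows(float * dst, const float * src0, const float * src1,
                                   int ne00, int ne01, int ne11) {
        for (int i01 = 0; i01 < ne01; ++i01) {
            const float * row1 = src1 + (i01 % ne11) * ne00;
            for (int i00 = 0; i00 < ne00; ++i00) {
                dst[i01*ne00 + i00] = src0[i01*ne00 + i00] * row1[i00];
            }
        }
    }
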
@@ -10527,6 +10663,7 @@ static void ggml_compute_forward_diag_mask_f32(
10527
10663
 
10528
10664
  const int n_past = ((int32_t *) src1->data)[0];
10529
10665
  const bool inplace = (bool)((int32_t *) src1->data)[1];
10666
+
10530
10667
  assert(n_past >= 0);
10531
10668
 
10532
10669
  if (!inplace && (params->type == GGML_TASK_INIT)) {
@@ -10697,14 +10834,15 @@ static void ggml_compute_forward_alibi_f32(
10697
10834
  struct ggml_tensor * dst) {
10698
10835
  assert(params->ith == 0);
10699
10836
  assert(src1->type == GGML_TYPE_I32);
10700
- assert(ggml_nelements(src1) == 2);
10837
+ assert(ggml_nelements(src1) == 3);
10701
10838
 
10702
10839
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10703
10840
  return;
10704
10841
  }
10705
10842
 
10706
- const int n_past = ((int32_t *) src1->data)[0];
10707
- const int n_head = ((int32_t *) src1->data)[1];
10843
+ const int n_past = ((int32_t *) src1->data)[0];
10844
+ const int n_head = ((int32_t *) src1->data)[1];
10845
+ const float max_bias = ((float *) src1->data)[2];
10708
10846
 
10709
10847
  assert(n_past >= 0);
10710
10848
 
@@ -10727,8 +10865,8 @@ static void ggml_compute_forward_alibi_f32(
10727
10865
  // add alibi to src0 (KQ_scaled)
10728
10866
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
10729
10867
 
10730
- const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
10731
- const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
10868
+ const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
10869
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
10732
10870
 
10733
10871
  for (int i = 0; i < ne0; i++) {
10734
10872
  for (int j = 0; j < ne1; j++) {
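
ggml_alibi now takes the maximum bias as a parameter (bias_max/max_bias) instead of hard-coding it to 8, so with n = n_heads_log2_floor the slope bases above become

    m_0 = 2^{-\mathrm{max\_bias}/n}, \qquad m_1 = 2^{-(\mathrm{max\_bias}/2)/n}

and, per the next hunk, the bias applied at position i changes from i * m_k to (i - ne0 + 1) * m_k, i.e. it is now measured backwards from the last position and is therefore non-positive.
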
@@ -10746,13 +10884,13 @@ static void ggml_compute_forward_alibi_f32(
10746
10884
  m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
10747
10885
  }
10748
10886
 
10749
- pdst[0] = i * m_k + src[0];
10887
+ pdst[0] = (i-ne0+1) * m_k + src[0];
10888
+
10750
10889
  }
10751
10890
  }
10752
10891
  }
10753
10892
  }
10754
10893
 
10755
-
10756
10894
  static void ggml_compute_forward_alibi_f16(
10757
10895
  const struct ggml_compute_params * params,
10758
10896
  const struct ggml_tensor * src0,
@@ -10760,14 +10898,15 @@ static void ggml_compute_forward_alibi_f16(
10760
10898
  struct ggml_tensor * dst) {
10761
10899
  assert(params->ith == 0);
10762
10900
  assert(src1->type == GGML_TYPE_I32);
10763
- assert(ggml_nelements(src1) == 2);
10901
+ assert(ggml_nelements(src1) == 3);
10764
10902
 
10765
10903
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10766
10904
  return;
10767
10905
  }
10768
10906
 
10769
- const int n_past = ((int32_t *) src1->data)[0];
10770
- const int n_head = ((int32_t *) src1->data)[1];
10907
+ const int n_past = ((int32_t *) src1->data)[0];
10908
+ const int n_head = ((int32_t *) src1->data)[1];
10909
+ const float max_bias = ((float *) src1->data)[2];
10771
10910
 
10772
10911
  assert(n_past >= 0);
10773
10912
 
@@ -10790,8 +10929,8 @@ static void ggml_compute_forward_alibi_f16(
10790
10929
  // add alibi to src0 (KQ_scaled)
10791
10930
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
10792
10931
 
10793
- const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
10794
- const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
10932
+ const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
10933
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
10795
10934
 
10796
10935
  for (int i = 0; i < ne0; i++) {
10797
10936
  for (int j = 0; j < ne1; j++) {
@@ -10810,7 +10949,7 @@ static void ggml_compute_forward_alibi_f16(
10810
10949
  }
10811
10950
 
10812
10951
  // we return F32
10813
- pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
10952
+ pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]);
10814
10953
  }
10815
10954
  }
10816
10955
  }
@@ -10846,6 +10985,77 @@ static void ggml_compute_forward_alibi(
10846
10985
  }
10847
10986
  }
10848
10987
 
10988
+
10989
+ // ggml_compute_forward_clamp
10990
+
10991
+ static void ggml_compute_forward_clamp_f32(
10992
+ const struct ggml_compute_params * params,
10993
+ const struct ggml_tensor * src0,
10994
+ const struct ggml_tensor * src1,
10995
+ struct ggml_tensor * dst) {
10996
+ assert(params->ith == 0);
10997
+ assert(src1->type == GGML_TYPE_I32);
10998
+ assert(ggml_nelements(src1) == 2);
10999
+
11000
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11001
+ return;
11002
+ }
11003
+
11004
+ const int min = ((float *) src1->data)[0];
11005
+ const int max = ((float *) src1->data)[1];
11006
+
11007
+ const int ith = params->ith;
11008
+ const int nth = params->nth;
11009
+
11010
+ const int n = ggml_nrows(src0);
11011
+ const int nc = src0->ne[0];
11012
+
11013
+ const size_t nb00 = src0->nb[0];
11014
+ const size_t nb01 = src0->nb[1];
11015
+
11016
+ const size_t nb0 = dst->nb[0];
11017
+ const size_t nb1 = dst->nb[1];
11018
+
11019
+ GGML_ASSERT( nb0 == sizeof(float));
11020
+ GGML_ASSERT(nb00 == sizeof(float));
11021
+
11022
+ for (int j = ith; j < n; j += nth) {
11023
+ float * dst_ptr = (float *) ((char *) dst->data + j*nb1);
11024
+ float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
11025
+
11026
+ for (int i = 0; i < nc; i++) {
11027
+ dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
11028
+ }
11029
+ }
11030
+ }
11031
+
11032
+ static void ggml_compute_forward_clamp(
11033
+ const struct ggml_compute_params * params,
11034
+ const struct ggml_tensor * src0,
11035
+ const struct ggml_tensor * src1,
11036
+ struct ggml_tensor * dst) {
11037
+ switch (src0->type) {
11038
+ case GGML_TYPE_F32:
11039
+ {
11040
+ ggml_compute_forward_clamp_f32(params, src0, src1, dst);
11041
+ } break;
11042
+ case GGML_TYPE_F16:
11043
+ case GGML_TYPE_Q4_0:
11044
+ case GGML_TYPE_Q4_1:
11045
+ case GGML_TYPE_Q5_0:
11046
+ case GGML_TYPE_Q5_1:
11047
+ case GGML_TYPE_Q8_0:
11048
+ case GGML_TYPE_Q8_1:
11049
+ case GGML_TYPE_I8:
11050
+ case GGML_TYPE_I16:
11051
+ case GGML_TYPE_I32:
11052
+ case GGML_TYPE_COUNT:
11053
+ {
11054
+ GGML_ASSERT(false);
11055
+ } break;
11056
+ }
11057
+ }
11058
+
10849
11059
  // ggml_compute_forward_rope
10850
11060
 
10851
11061
  static void ggml_compute_forward_rope_f32(
@@ -12827,6 +13037,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
12827
13037
  {
12828
13038
  ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
12829
13039
  } break;
13040
+ case GGML_OP_CLAMP:
13041
+ {
13042
+ ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
13043
+ } break;
12830
13044
  case GGML_OP_CONV_1D_1S:
12831
13045
  {
12832
13046
  ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
@@ -13134,6 +13348,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
13134
13348
  {
13135
13349
  GGML_ASSERT(false); // TODO: not implemented
13136
13350
  } break;
13351
+ case GGML_OP_CLAMP:
13352
+ {
13353
+ GGML_ASSERT(false); // TODO: not implemented
13354
+ } break;
13137
13355
  case GGML_OP_SILU:
13138
13356
  {
13139
13357
  // necessary for llama
@@ -14013,6 +14231,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
14013
14231
  {
14014
14232
  node->n_tasks = 1; //TODO
14015
14233
  } break;
14234
+ case GGML_OP_CLAMP:
14235
+ {
14236
+ node->n_tasks = 1; //TODO
14237
+ } break;
14016
14238
  case GGML_OP_CONV_1D_1S:
14017
14239
  case GGML_OP_CONV_1D_2S:
14018
14240
  {
@@ -14409,9 +14631,12 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
14409
14631
  fprintf(fp, "%s |", node->name);
14410
14632
  }
14411
14633
 
14412
- fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s",
14413
- i, node->ne[0], node->ne[1],
14414
- GGML_OP_SYMBOL[node->op]);
14634
+ if (node->n_dims == 2) {
14635
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], GGML_OP_SYMBOL[node->op]);
14636
+ } else {
14637
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
14638
+ }
14639
+
14415
14640
 
14416
14641
  if (node->grad) {
14417
14642
  fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);