llama_cpp 0.1.1 → 0.1.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +60 -6
- data/ext/llama_cpp/src/ggml-cuda.h +2 -0
- data/ext/llama_cpp/src/ggml-opencl.c +246 -133
- data/ext/llama_cpp/src/ggml.c +362 -137
- data/ext/llama_cpp/src/ggml.h +13 -3
- data/ext/llama_cpp/src/llama-util.h +23 -23
- data/ext/llama_cpp/src/llama.cpp +173 -102
- data/ext/llama_cpp/src/llama.h +30 -17
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -0
- data/sig/llama_cpp.rbs +1 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -512,7 +512,7 @@ static inline int hsum_i32_4(const __m128i a) {
     return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
 }
 
-#if __AVX2__ || __AVX512F__
+#if defined(__AVX2__) || defined(__AVX512F__)
 // spread 32 bits to 32 bytes { 0x00, 0xFF }
 static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     uint32_t x32;
@@ -688,7 +688,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // __AVX__ || __AVX2__ || __AVX512F__
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 
-#if __ARM_NEON
+#if defined(__ARM_NEON)
 
 #if !defined(__aarch64__)
 
@@ -740,19 +740,19 @@ inline static float vaddvq_f32(float32x4_t v) {
     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 }
 
-float vminvq_f32(float32x4_t v) {
+inline static float vminvq_f32(float32x4_t v) {
     return
         MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
             MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
 }
 
-float vmaxvq_f32(float32x4_t v) {
+inline static float vmaxvq_f32(float32x4_t v) {
     return
         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
             MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
 }
 
-int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
     int32x4_t res;
 
     res[0] = roundf(vgetq_lane_f32(v, 0));
@@ -766,21 +766,20 @@ int32x4_t vcvtnq_s32_f32(float32x4_t v) {
 #endif
 #endif
 
-
 #define QK4_0 32
 typedef struct {
-    float   d;          // delta
+    ggml_fp16_t d;          // delta
     uint8_t qs[QK4_0 / 2];  // nibbles / quants
 } block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
+static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
 
 #define QK4_1 32
 typedef struct {
-    float   d;          // delta
-    float   m;          // min
+    ggml_fp16_t d;          // delta
+    ggml_fp16_t m;          // min
     uint8_t qs[QK4_1 / 2];  // nibbles / quants
 } block_q4_1;
-static_assert(sizeof(block_q4_1) == 2 * sizeof(float) + QK4_1 / 2, "wrong q4_1 block size/padding");
+static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
 
 #define QK5_0 32
 typedef struct {
@@ -801,16 +800,16 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
 
 #define QK8_0 32
 typedef struct {
-    float   d;          // delta
-    int8_t  qs[QK8_0];  // quants
+    ggml_fp16_t d;      // delta
+    int8_t  qs[QK8_0];  // quants
 } block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
+static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
 
 #define QK8_1 32
 typedef struct {
-    float d;          // delta
-    float s;          // d * sum(qs[i])
-    int8_t qs[QK8_1]; // quants
+    float   d;          // delta
+    float   s;          // d * sum(qs[i])
+    int8_t  qs[QK8_1];  // quants
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
 
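Note: the q4_0, q4_1 and q8_0 blocks above now store their scales as `ggml_fp16_t` instead of `float`, which shrinks each block by 2–4 bytes. The following is a minimal, hypothetical sketch (not library code) of what that means for one q8_0 block; it mirrors the reference quantizer changed further down in this diff, and the helper names `quantize_one_q8_0`/`dequantize_one_q8_0` are invented for illustration.

    #include <math.h>
    #include <stdint.h>

    // Sketch: the scale d is computed in fp32, stored as fp16, and converted
    // back to fp32 on dequantization, so a small rounding error in d is the
    // price of the smaller block.
    static void quantize_one_q8_0(const float * x, block_q8_0 * y) {
        float amax = 0.0f;                      // absolute max of the 32 values
        for (int j = 0; j < QK8_0; ++j) {
            amax = MAX(amax, fabsf(x[j]));
        }

        const float d  = amax / ((1 << 7) - 1); // map [-amax, amax] to [-127, 127]
        const float id = d ? 1.0f/d : 0.0f;

        y->d = GGML_FP32_TO_FP16(d);            // scale is now stored as fp16

        for (int j = 0; j < QK8_0; ++j) {
            y->qs[j] = (int8_t) roundf(x[j]*id);
        }
    }

    static void dequantize_one_q8_0(const block_q8_0 * x, float * y) {
        const float d = GGML_FP16_TO_FP32(x->d); // convert the fp16 scale back
        for (int j = 0; j < QK8_0; ++j) {
            y[j] = x->qs[j]*d;
        }
    }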
@@ -837,7 +836,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
         const float d  = max / -8;
         const float id = d ? 1.0f/d : 0.0f;
 
-        y[i].d = d;
+        y[i].d = GGML_FP32_TO_FP16(d);
 
         for (int j = 0; j < qk/2; ++j) {
             const float x0 = x[i*qk + 0 + j]*id;
@@ -877,8 +876,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r
         const float d  = (max - min) / ((1 << 4) - 1);
         const float id = d ? 1.0f/d : 0.0f;
 
-        y[i].d = d;
-        y[i].m = min;
+        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].m = GGML_FP32_TO_FP16(min);
 
         for (int j = 0; j < qk/2; ++j) {
             const float x0 = (x[i*qk + 0 + j] - min)*id;
@@ -1009,7 +1008,7 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
         const float d = amax / ((1 << 7) - 1);
         const float id = d ? 1.0f/d : 0.0f;
 
-        y[i].d = d;
+        y[i].d = GGML_FP32_TO_FP16(d);
 
         for (int j = 0; j < QK8_0; ++j) {
             const float x0 = x[i*QK8_0 + j]*id;
@@ -1044,7 +1043,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
         const float d = amax / ((1 << 7) - 1);
         const float id = d ? 1.0f/d : 0.0f;
 
-        y[i].d = d;
+        y[i].d = GGML_FP32_TO_FP16(d);
 
         for (int j = 0; j < 8; j++) {
             const float32x4_t v = vmulq_n_f32(srcv[j], id);
@@ -1056,6 +1055,39 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
             y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
         }
     }
+#elif defined(__wasm_simd128__)
+    for (int i = 0; i < nb; i++) {
+        v128_t srcv [8];
+        v128_t asrcv[8];
+        v128_t amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = wasm_v128_load(x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
+                                   wasm_f32x4_extract_lane(amaxv[0], 1)),
+                               MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
+                                   wasm_f32x4_extract_lane(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (int j = 0; j < 8; j++) {
+            const v128_t v  = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
+            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
+
+            y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
+            y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
+            y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
+            y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
+        }
+    }
 #elif defined(__AVX2__) || defined(__AVX__)
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
@@ -1079,7 +1111,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
 
         // Quantize these floats
         const float d = maxScalar / 127.f;
-        y[i].d = d;
+        y[i].d = GGML_FP32_TO_FP16(d);
         const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
         const __m256 mul = _mm256_set1_ps( id );
 
@@ -1178,7 +1210,7 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r
             sum += y[i].qs[QK8_1/2 + j];
         }
 
-        y[i].s = d * sum;
+        y[i].s = sum*d;
     }
 }
 
@@ -1224,6 +1256,48 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
 
         y[i].s = d * vaddvq_s32(accv);
     }
+#elif defined(__wasm_simd128__)
+    for (int i = 0; i < nb; i++) {
+        v128_t srcv [8];
+        v128_t asrcv[8];
+        v128_t amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = wasm_v128_load(x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
+                                   wasm_f32x4_extract_lane(amaxv[0], 1)),
+                               MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
+                                   wasm_f32x4_extract_lane(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = d;
+
+        v128_t accv = wasm_i32x4_splat(0);
+
+        for (int j = 0; j < 8; j++) {
+            const v128_t v  = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
+            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
+
+            y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
+            y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
+            y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
+            y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
+
+            accv = wasm_i32x4_add(accv, vi);
+        }
+
+        y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) +
+                      wasm_i32x4_extract_lane(accv, 1) +
+                      wasm_i32x4_extract_lane(accv, 2) +
+                      wasm_i32x4_extract_lane(accv, 3));
+    }
 #elif defined(__AVX2__) || defined(__AVX__)
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
@@ -1330,7 +1404,7 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict
     const int nb = k / qk;
 
     for (int i = 0; i < nb; i++) {
-        const float d = x[i].d;
+        const float d = GGML_FP16_TO_FP32(x[i].d);
 
         for (int j = 0; j < qk/2; ++j) {
             const int x0 = (x[i].qs[j] & 0x0F) - 8;
@@ -1350,8 +1424,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict
     const int nb = k / qk;
 
     for (int i = 0; i < nb; i++) {
-        const float d = x[i].d;
-        const float m = x[i].m;
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const float m = GGML_FP16_TO_FP32(x[i].m);
 
         for (int j = 0; j < qk/2; ++j) {
             const int x0 = (x[i].qs[j] & 0x0F);
@@ -1426,7 +1500,7 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
     const block_q8_0 * restrict x = vx;
 
     for (int i = 0; i < nb; i++) {
-        const float d = x[i].d;
+        const float d = GGML_FP16_TO_FP32(x[i].d);
 
         for (int j = 0; j < qk; ++j) {
             y[i*qk + j] = x[i].qs[j]*d;
@@ -1690,8 +1764,9 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
     float tmp[8];
 
-    for (int i = 0; i < 8; i++)
+    for (int i = 0; i < 8; i++) {
         tmp[i] = GGML_FP16_TO_FP32(x[i]);
+    }
 
     return _mm256_loadu_ps(tmp);
 }
@@ -2111,8 +2186,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         const block_q8_0 * restrict y0 = &y[i + 0];
         const block_q8_0 * restrict y1 = &y[i + 1];
 
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-        const int8x16_t  s8b = vdupq_n_s8(0x8);
+        const uint8x16_t m4b   = vdupq_n_u8(0x0F);
+        const int8x16_t  s8b   = vdupq_n_s8(0x8);
 
         const uint8x16_t v0_0 = vld1q_u8(x0->qs);
         const uint8x16_t v0_1 = vld1q_u8(x1->qs);
@@ -2140,8 +2215,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
         const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
 
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
 #else
         const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
         const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
@@ -2158,8 +2233,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
         const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
 
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
 #endif
     }
 
@@ -2171,7 +2246,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     // Main loop
     for (int i = 0; i < nb; ++i) {
         /* Compute combined scale for the block */
-        const __m256 d = _mm256_set1_ps( x[i].d * y[i].d );
+        const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
 
         __m256i bx = bytes_from_nibbles_32(x[i].qs);
 
@@ -2195,7 +2270,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     // Main loop
     for (int i = 0; i < nb; ++i) {
         // Compute combined scale for the block
-        const __m256 d = _mm256_set1_ps( x[i].d * y[i].d );
+        const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
 
         const __m128i lowMask = _mm_set1_epi8(0xF);
         const __m128i off = _mm_set1_epi8(8);
@@ -2237,7 +2312,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0);
 
         // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = _mm_set1_ps( x[0].d * y[0].d );
+        const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) );
 
         const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
 
@@ -2255,7 +2330,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);
 
         // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = _mm_set1_ps( x[1].d * y[1].d );
+        const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) );
 
         const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);
 
@@ -2288,7 +2363,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
 
         // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = _mm_set1_ps( x[i].d * y[i].d );
+        const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
 
         const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);
 
@@ -2306,7 +2381,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
 
         // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = _mm_set1_ps( x[i + 1].d * y[i + 1].d );
+        const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) );
 
         const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);
 
@@ -2354,7 +2429,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
             sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
         }
 
-        sumf += (x[i].d*y[i].d)*sumi;
+        sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
     }
 
     *s = sumf;
@@ -2384,7 +2459,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
         const block_q8_1 * restrict y0 = &y[i + 0];
         const block_q8_1 * restrict y1 = &y[i + 1];
 
-        summs += x0->m * y0->s + x1->m * y1->s;
+        summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s;
 
         const uint8x16_t m4b = vdupq_n_u8(0x0F);
 
@@ -2408,8 +2483,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
         const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
         const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
 
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
 #else
         const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
         const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
@@ -2426,8 +2501,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
         const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
         const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
 
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
 #endif
     }
 
@@ -2440,13 +2515,13 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
 
     // Main loop
     for (int i = 0; i < nb; ++i) {
-        const float * d0 = &x[i].d;
-        const float * d1 = &y[i].d;
+        const float d0 = GGML_FP16_TO_FP32(x[i].d);
+        const float d1 = y[i].d;
 
-        summs += x[i].m * y[i].s;
+        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
 
-        const __m256 d0v = _mm256_broadcast_ss( d0 );
-        const __m256 d1v = _mm256_broadcast_ss( d1 );
+        const __m256 d0v = _mm256_set1_ps( d0 );
+        const __m256 d1v = _mm256_set1_ps( d1 );
 
         // Compute combined scales
         const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
@@ -2480,7 +2555,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
             sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
         }
 
-        sumf += (x[i].d*y[i].d)*sumi + x[i].m*y[i].s;
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
     }
 
     *s = sumf;
@@ -2556,16 +2631,13 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         const int8x16_t v1_1l = vld1q_s8(y1->qs);
         const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
 
-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-        const float x1d = GGML_FP16_TO_FP32(x1->d);
-
 #if defined(__ARM_FEATURE_DOTPROD)
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
                         vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d);
+                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
                         vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d);
+                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
 #else
         const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
         const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
@@ -2582,8 +2654,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
         const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
 
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d);
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
 #endif
     }
 
@@ -2600,7 +2672,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         const block_q8_0 * restrict y0 = &y[i];
 
         const v128_t m4b  = wasm_i8x16_splat(0x0F);
-        const v128_t s16b = wasm_i8x16_splat(0x10);
 
         // extract the 5th bit
         memcpy(&qh, x0->qh, sizeof(qh));
@@ -2638,15 +2709,14 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
         const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
 
-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-
         // dot product
         sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
                         wasm_i32x4_add(
                             wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
                                            wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
                             wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
+                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
+                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
     }
 
     *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -2658,7 +2728,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     // Main loop
     for (int i = 0; i < nb; i++) {
         /* Compute combined scale for the block */
-        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * y[i].d);
+        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
 
         __m256i bx = bytes_from_nibbles_32(x[i].qs);
         __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -2682,7 +2752,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     // Main loop
     for (int i = 0; i < nb; i++) {
         /* Compute combined scale for the block */
-        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * y[i].d);
+        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
 
         __m256i bx = bytes_from_nibbles_32(x[i].qs);
         const __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -2725,7 +2795,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
             sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
         }
 
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi;
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
     }
 
     *s = sumf;
@@ -2807,16 +2877,13 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         const int8x16_t v1_1l = vld1q_s8(y1->qs);
         const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
 
-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-        const float x1d = GGML_FP16_TO_FP32(x1->d);
-
 #if defined(__ARM_FEATURE_DOTPROD)
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
                         vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d);
+                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
                         vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d);
+                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
 #else
         const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
         const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
@@ -2833,8 +2900,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
         const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
 
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d);
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
 #endif
     }
 
@@ -2873,8 +2940,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         const v128_t v0l = wasm_v128_and (v0, m4b);
         const v128_t v0h = wasm_u8x16_shr(v0, 4);
 
-        static bool x = true;
-
         // add high bit
         const v128_t v0lf = wasm_v128_or(v0l, qhl);
         const v128_t v0hf = wasm_v128_or(v0h, qhh);
@@ -2894,15 +2959,14 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
         const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
 
-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-
         // dot product
-        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
-                        wasm_i32x4_add(
+        sumv = wasm_f32x4_add(sumv,
+                wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
                             wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
                                            wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
                             wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
+                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
+                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
     }
 
     *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -2924,7 +2988,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
         bx = _mm256_or_si256(bx, bxhi);
 
-        const __m256 dy = _mm256_broadcast_ss(&y[i].d);
+        const __m256 dy = _mm256_set1_ps(y[i].d);
 
         const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
         const __m256 q = mul_sum_us8_pairs_float(bx, by);
@@ -2958,7 +3022,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         bxh = _mm_or_si128(bxh, bxhih);
         bx = _mm256_set_m128i(bxh, bxl);
 
-        const __m256 dy = _mm256_broadcast_ss(&y[i].d);
+        const __m256 dy = _mm256_set1_ps(y[i].d);
 
         const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
         const __m256 q = mul_sum_us8_pairs_float(bx, by);
@@ -3028,11 +3092,11 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
 #if defined(__ARM_FEATURE_DOTPROD)
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
                         vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
-                        vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), x0->d*y0->d);
+                        vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
 
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
                         vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
-                        vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), x1->d*y1->d);
+                        vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
 
 #else
         const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
@@ -3050,8 +3114,8 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
         const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
         const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
 
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), x1->d*y1->d);
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
 #endif
     }
 
@@ -3063,7 +3127,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
     // Main loop
     for (int i = 0; i < nb; ++i) {
         // Compute combined scale for the block
-        const __m256 d = _mm256_set1_ps(x[i].d * y[i].d);
+        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
         __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
         __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
@@ -3089,7 +3153,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
             sumi += x[i].qs[j]*y[i].qs[j];
         }
 
-        sumf += (x[i].d*y[i].d)*sumi;
+        sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
     }
 
     *s = sumf;
@@ -3478,6 +3542,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "ROPE",
     "ROPE_BACK",
     "ALIBI",
+    "CLAMP",
     "CONV_1D_1S",
     "CONV_1D_2S",
 
@@ -3488,7 +3553,8 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "MAP_BINARY",
 };
 
-static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50");
+static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");
+
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3538,6 +3604,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "rope(x)",
     "rope_back(x)",
     "alibi(x)",
+    "clamp(x)",
     "conv_1d_1s(x)",
    "conv_1d_2s(x)",
 
@@ -3548,7 +3615,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "f(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50");
+static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -3782,6 +3849,12 @@ static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct g
            (t1->ne[3]%t0->ne[3] == 0);
 }
 
+static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
+}
+
 static inline int ggml_up32(int n) {
     return (n + 31) & ~31;
 }
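Note: `ggml_can_repeat_rows(t0, t1)` only accepts tensors whose rows have the same length, so the broadcasting added for `ggml_mul` below works per row rather than per element. A few illustrative shapes (example sizes, not taken from the diff):

    // ne = [ne0, ne1, ne2, ne3]; t0 broadcasts over t1 when every ne of t1 is a
    // multiple of the corresponding ne of t0 (ggml_can_repeat) AND ne0 matches.
    // t0 = [4096,  1, 1, 1], t1 = [4096, 32, 8, 1]  -> true  (t0 reused for every row)
    // t0 = [4096,  8, 1, 1], t1 = [4096, 32, 8, 1]  -> true  (32 % 8 == 0)
    // t0 = [2048,  1, 1, 1], t1 = [4096, 32, 8, 1]  -> false (row length differs)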
@@ -4664,11 +4737,15 @@ struct ggml_tensor * ggml_mul_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
+    // TODO: support less-strict constraint
+    //       GGML_ASSERT(ggml_can_repeat(b, a));
+    GGML_ASSERT(ggml_can_repeat_rows(b, a));
 
     bool is_node = false;
 
     if (!inplace && (a->grad || b->grad)) {
+        // TODO: support backward pass for broadcasting
+        GGML_ASSERT(ggml_are_same_shape(a, b));
         is_node = true;
     }
 
@@ -6210,7 +6287,8 @@ struct ggml_tensor * ggml_alibi(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         int                   n_past,
-        int                   n_head) {
+        int                   n_head,
+        float                 bias_max) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;
 
@@ -6229,6 +6307,8 @@ struct ggml_tensor * ggml_alibi(
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_head;
+    GGML_ASSERT(sizeof(float) == sizeof(int32_t));
+    (((float *) b->data)[2]) = bias_max;
 
     ggml_scratch_load(ctx);
 
@@ -6240,6 +6320,40 @@ struct ggml_tensor * ggml_alibi(
     return result;
 }
 
+// ggml_clamp
+
+struct ggml_tensor * ggml_clamp(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 min,
+        float                 max) {
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    // TODO: when implement backward, fix this:
+    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+
+    ((float *) b->data)[0] = min;
+    ((float *) b->data)[1] = max;
+
+    ggml_scratch_load(ctx);
+
+    result->op   = GGML_OP_CLAMP;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+
+    return result;
+}
+
 // ggml_conv_1d_1s
 
 struct ggml_tensor * ggml_conv_1d_1s(
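Note: a hedged usage sketch for the new `ggml_clamp` op. The graph-building calls are the ggml API of this era as I understand it; the tensor size, memory budget, and clamp range are arbitrary example values, not taken from the diff.

    // Clamp every element of a small f32 tensor into [-1, 1].
    struct ggml_init_params ip = { .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    // ... fill ((float *) x->data)[0..7] ...

    struct ggml_tensor * y = ggml_clamp(ctx, x, -1.0f, 1.0f);

    struct ggml_cgraph gf = ggml_build_forward(y);
    ggml_graph_compute(ctx, &gf);   // y now holds MAX(MIN(x, 1.0f), -1.0f) per element

    ggml_free(ctx);

Since `ggml_clamp` builds its result with `ggml_view_tensor`, the clamp is effectively applied in place over `a`'s data.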
@@ -7966,7 +8080,7 @@ static void ggml_compute_forward_mul_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -7974,10 +8088,25 @@ static void ggml_compute_forward_mul_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int nr  = ggml_nrows(src0);
-    const int64_t ne0 = src0->ne[0];
-    const int64_t ne1 = src0->ne[1];
-    const int64_t ne2 = src0->ne[2];
+#ifdef GGML_USE_CUBLAS
+    if (src1->backend == GGML_BACKEND_CUDA) {
+        if (ith == 0) {
+            ggml_cuda_mul(src0, src1, dst);
+        }
+        return;
+    }
+#endif
+
+    const int64_t nr = ggml_nrows(src0);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const size_t nb00 = src0->nb[0];
     const size_t nb01 = src0->nb[1];
@@ -7996,44 +8125,51 @@ static void ggml_compute_forward_mul_f32(
 
     GGML_ASSERT( nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(ne00 == ne10);
 
     if (nb10 == sizeof(float)) {
-        for (int ir = ith; ir < nr; ir += nth) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
 
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
 
 #ifdef GGML_USE_ACCELERATE
             UNUSED(ggml_vec_mul_f32);
 
-            vDSP_vmul(
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
-                    ne0);
+            vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
 #else
-            ggml_vec_mul_f32(ne0,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+            ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
 #endif
                 // }
             // }
         }
     } else {
         // src1 is not contiguous
-        for (int ir = ith; ir < nr; ir += nth) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
 
-            float * dst_ptr  = (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-            for (int i0 = 0; i0 < ne0; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+            for (int64_t i0 = 0; i0 < ne00; i0++) {
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
 
                 dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
             }
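Note: the rewritten loops above drive the broadcast. `src0` and `dst` share indices, while each `src1` index is wrapped with a modulo so a smaller `src1` is reused. A small sketch of the index math on made-up shapes (comments only, not code from the diff):

    // src0: ne01 = 32 rows, ne02 = 8 planes; src1: ne11 = 1 row, ne12 = 8 planes.
    // For the src0 row (i01, i02, i03), the src1 row actually read is:
    //   i11 = i01 % ne11;   // always 0 here -> the single src1 row is reused
    //   i12 = i02 % ne12;   // follows i02, since the plane counts match
    //   i13 = i03 % ne13;
    // With ne11 == ne12 == ne13 == 1 this degenerates to multiplying every row
    // of src0 by the same src1 row, which is what ggml_can_repeat_rows permits.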
@@ -10527,6 +10663,7 @@ static void ggml_compute_forward_diag_mask_f32(
 
     const int  n_past  = ((int32_t *) src1->data)[0];
     const bool inplace = (bool)((int32_t *) src1->data)[1];
+
     assert(n_past >= 0);
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
@@ -10697,14 +10834,15 @@ static void ggml_compute_forward_alibi_f32(
         struct ggml_tensor * dst) {
     assert(params->ith == 0);
     assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 2);
+    assert(ggml_nelements(src1) == 3);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_head = ((int32_t *) src1->data)[1];
+    const int   n_past   = ((int32_t *) src1->data)[0];
+    const int   n_head   = ((int32_t *) src1->data)[1];
+    const float max_bias = ((float *) src1->data)[2];
 
     assert(n_past >= 0);
 
@@ -10727,8 +10865,8 @@ static void ggml_compute_forward_alibi_f32(
     // add alibi to src0 (KQ_scaled)
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
 
-    const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
+    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
 
     for (int i = 0; i < ne0; i++) {
         for (int j = 0; j < ne1; j++) {
@@ -10746,13 +10884,13 @@ static void ggml_compute_forward_alibi_f32(
                     m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
                 }
 
-                pdst[0] = i * m_k + src[0];
+                pdst[0] = (i-ne0+1) * m_k + src[0];
+
             }
         }
     }
 }
 
-
 static void ggml_compute_forward_alibi_f16(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -10760,14 +10898,15 @@ static void ggml_compute_forward_alibi_f16(
         struct ggml_tensor * dst) {
     assert(params->ith == 0);
     assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 2);
+    assert(ggml_nelements(src1) == 3);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_head = ((int32_t *) src1->data)[1];
+    const int   n_past   = ((int32_t *) src1->data)[0];
+    const int   n_head   = ((int32_t *) src1->data)[1];
+    const float max_bias = ((float *) src1->data)[2];
 
     assert(n_past >= 0);
 
@@ -10790,8 +10929,8 @@ static void ggml_compute_forward_alibi_f16(
     // add alibi to src0 (KQ_scaled)
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
 
-    const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
+    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
 
     for (int i = 0; i < ne0; i++) {
         for (int j = 0; j < ne1; j++) {
@@ -10810,7 +10949,7 @@ static void ggml_compute_forward_alibi_f16(
                 }
 
                 // we return F32
-                pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
+                pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]);
             }
         }
     }
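Note: the previously hard-coded 8.0f/4.0f constants are now driven by `max_bias` (the `bias_max` argument of `ggml_alibi`). A small standalone helper that mirrors the arithmetic above; the `k < n_heads_log2_floor` branch follows the standard ALiBi interpolation for non-power-of-two head counts and is assumed rather than visible in these hunks.

    #include <math.h>

    // ALiBi slope for attention head k (0-based): heads below the nearest
    // power of two use powers of m0, the rest use odd powers of m1.
    static float alibi_slope(int k, int n_head, float max_bias) {
        const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));

        const float m0 = powf(2.0f, -(max_bias)        / n_heads_log2_floor);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

        return k < n_heads_log2_floor ? powf(m0, k + 1)
                                      : powf(m1, 2*(k - n_heads_log2_floor) + 1);
    }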
@@ -10846,6 +10985,77 @@ static void ggml_compute_forward_alibi(
     }
 }
 
+
+// ggml_compute_forward_clamp
+
+static void ggml_compute_forward_clamp_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0);
+    assert(src1->type == GGML_TYPE_I32);
+    assert(ggml_nelements(src1) == 2);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int min = ((float *) src1->data)[0];
+    const int max = ((float *) src1->data)[1];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    for (int j = ith; j < n; j += nth) {
+        float * dst_ptr  = (float *) ((char *)  dst->data + j*nb1);
+        float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
+
+        for (int i = 0; i < nc; i++) {
+            dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
+        }
+    }
+}
+
+static void ggml_compute_forward_clamp(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_clamp_f32(params, src0, src1, dst);
+            } break;
+        case GGML_TYPE_F16:
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_COUNT:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_rope
 
 static void ggml_compute_forward_rope_f32(
@@ -12827,6 +13037,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
             } break;
+        case GGML_OP_CLAMP:
+            {
+                ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
+            } break;
         case GGML_OP_CONV_1D_1S:
             {
                 ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
@@ -13134,6 +13348,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_CLAMP:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_SILU:
             {
                 // necessary for llama
@@ -14013,6 +14231,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             {
                 node->n_tasks = 1; //TODO
             } break;
+        case GGML_OP_CLAMP:
+            {
+                node->n_tasks = 1; //TODO
+            } break;
         case GGML_OP_CONV_1D_1S:
         case GGML_OP_CONV_1D_2S:
             {
@@ -14409,9 +14631,12 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             fprintf(fp, "%s |", node->name);
         }
 
-        fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s",
-                i, node->ne[0], node->ne[1],
-                GGML_OP_SYMBOL[node->op]);
+        if (node->n_dims == 2) {
+            fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], GGML_OP_SYMBOL[node->op]);
+        } else {
+            fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
+        }
+
 
         if (node->grad) {
             fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);