llama_cpp 0.1.1 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/extconf.rb +8 -2
- data/ext/llama_cpp/llama_cpp.cpp +60 -6
- data/ext/llama_cpp/src/ggml-cuda.h +2 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +1034 -0
- data/ext/llama_cpp/src/ggml-opencl.h +8 -10
- data/ext/llama_cpp/src/ggml.c +398 -184
- data/ext/llama_cpp/src/ggml.h +14 -3
- data/ext/llama_cpp/src/llama-util.h +23 -23
- data/ext/llama_cpp/src/llama.cpp +191 -92
- data/ext/llama_cpp/src/llama.h +30 -17
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -0
- data/sig/llama_cpp.rbs +1 -0
- metadata +3 -3
- data/ext/llama_cpp/src/ggml-opencl.c +0 -361
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -512,7 +512,7 @@ static inline int hsum_i32_4(const __m128i a) {
|
|
512
512
|
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
|
513
513
|
}
|
514
514
|
|
515
|
-
#if __AVX2__ || __AVX512F__
|
515
|
+
#if defined(__AVX2__) || defined(__AVX512F__)
|
516
516
|
// spread 32 bits to 32 bytes { 0x00, 0xFF }
|
517
517
|
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
|
518
518
|
uint32_t x32;
|
@@ -688,7 +688,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
|
|
688
688
|
#endif // __AVX__ || __AVX2__ || __AVX512F__
|
689
689
|
#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
690
690
|
|
691
|
-
#if __ARM_NEON
|
691
|
+
#if defined(__ARM_NEON)
|
692
692
|
|
693
693
|
#if !defined(__aarch64__)
|
694
694
|
|
@@ -740,19 +740,19 @@ inline static float vaddvq_f32(float32x4_t v) {
|
|
740
740
|
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
741
741
|
}
|
742
742
|
|
743
|
-
float vminvq_f32(float32x4_t v) {
|
743
|
+
inline static float vminvq_f32(float32x4_t v) {
|
744
744
|
return
|
745
745
|
MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
746
746
|
MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
747
747
|
}
|
748
748
|
|
749
|
-
float vmaxvq_f32(float32x4_t v) {
|
749
|
+
inline static float vmaxvq_f32(float32x4_t v) {
|
750
750
|
return
|
751
751
|
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
752
752
|
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
753
753
|
}
|
754
754
|
|
755
|
-
int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
755
|
+
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
756
756
|
int32x4_t res;
|
757
757
|
|
758
758
|
res[0] = roundf(vgetq_lane_f32(v, 0));
|
@@ -766,21 +766,20 @@ int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
|
766
766
|
#endif
|
767
767
|
#endif
|
768
768
|
|
769
|
-
|
770
769
|
#define QK4_0 32
|
771
770
|
typedef struct {
|
772
|
-
|
771
|
+
ggml_fp16_t d; // delta
|
773
772
|
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
774
773
|
} block_q4_0;
|
775
|
-
static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
774
|
+
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
776
775
|
|
777
776
|
#define QK4_1 32
|
778
777
|
typedef struct {
|
779
|
-
|
780
|
-
|
778
|
+
ggml_fp16_t d; // delta
|
779
|
+
ggml_fp16_t m; // min
|
781
780
|
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
782
781
|
} block_q4_1;
|
783
|
-
static_assert(sizeof(block_q4_1) == 2 * sizeof(float) + QK4_1 / 2, "wrong q4_1 block size/padding");
|
782
|
+
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
|
784
783
|
|
785
784
|
#define QK5_0 32
|
786
785
|
typedef struct {
|
@@ -801,16 +800,16 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
|
|
801
800
|
|
802
801
|
#define QK8_0 32
|
803
802
|
typedef struct {
|
804
|
-
|
805
|
-
int8_t qs[QK8_0];
|
803
|
+
ggml_fp16_t d; // delta
|
804
|
+
int8_t qs[QK8_0]; // quants
|
806
805
|
} block_q8_0;
|
807
|
-
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
|
806
|
+
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
|
808
807
|
|
809
808
|
#define QK8_1 32
|
810
809
|
typedef struct {
|
811
|
-
float
|
812
|
-
float
|
813
|
-
int8_t qs[QK8_1];
|
810
|
+
float d; // delta
|
811
|
+
float s; // d * sum(qs[i])
|
812
|
+
int8_t qs[QK8_1]; // quants
|
814
813
|
} block_q8_1;
|
815
814
|
static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
|
816
815
|
|
@@ -837,7 +836,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
|
|
837
836
|
const float d = max / -8;
|
838
837
|
const float id = d ? 1.0f/d : 0.0f;
|
839
838
|
|
840
|
-
y[i].d = d;
|
839
|
+
y[i].d = GGML_FP32_TO_FP16(d);
|
841
840
|
|
842
841
|
for (int j = 0; j < qk/2; ++j) {
|
843
842
|
const float x0 = x[i*qk + 0 + j]*id;
|
@@ -877,8 +876,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r
|
|
877
876
|
const float d = (max - min) / ((1 << 4) - 1);
|
878
877
|
const float id = d ? 1.0f/d : 0.0f;
|
879
878
|
|
880
|
-
y[i].d = d;
|
881
|
-
y[i].m = min;
|
879
|
+
y[i].d = GGML_FP32_TO_FP16(d);
|
880
|
+
y[i].m = GGML_FP32_TO_FP16(min);
|
882
881
|
|
883
882
|
for (int j = 0; j < qk/2; ++j) {
|
884
883
|
const float x0 = (x[i*qk + 0 + j] - min)*id;
|
@@ -1009,7 +1008,7 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
|
|
1009
1008
|
const float d = amax / ((1 << 7) - 1);
|
1010
1009
|
const float id = d ? 1.0f/d : 0.0f;
|
1011
1010
|
|
1012
|
-
y[i].d = d;
|
1011
|
+
y[i].d = GGML_FP32_TO_FP16(d);
|
1013
1012
|
|
1014
1013
|
for (int j = 0; j < QK8_0; ++j) {
|
1015
1014
|
const float x0 = x[i*QK8_0 + j]*id;
|
@@ -1044,7 +1043,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
|
|
1044
1043
|
const float d = amax / ((1 << 7) - 1);
|
1045
1044
|
const float id = d ? 1.0f/d : 0.0f;
|
1046
1045
|
|
1047
|
-
y[i].d = d;
|
1046
|
+
y[i].d = GGML_FP32_TO_FP16(d);
|
1048
1047
|
|
1049
1048
|
for (int j = 0; j < 8; j++) {
|
1050
1049
|
const float32x4_t v = vmulq_n_f32(srcv[j], id);
|
@@ -1056,6 +1055,39 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
|
|
1056
1055
|
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
|
1057
1056
|
}
|
1058
1057
|
}
|
1058
|
+
#elif defined(__wasm_simd128__)
|
1059
|
+
for (int i = 0; i < nb; i++) {
|
1060
|
+
v128_t srcv [8];
|
1061
|
+
v128_t asrcv[8];
|
1062
|
+
v128_t amaxv[8];
|
1063
|
+
|
1064
|
+
for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
|
1065
|
+
for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
|
1066
|
+
|
1067
|
+
for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
|
1068
|
+
for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
|
1069
|
+
for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
|
1070
|
+
|
1071
|
+
const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
|
1072
|
+
wasm_f32x4_extract_lane(amaxv[0], 1)),
|
1073
|
+
MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
|
1074
|
+
wasm_f32x4_extract_lane(amaxv[0], 3)));
|
1075
|
+
|
1076
|
+
const float d = amax / ((1 << 7) - 1);
|
1077
|
+
const float id = d ? 1.0f/d : 0.0f;
|
1078
|
+
|
1079
|
+
y[i].d = GGML_FP32_TO_FP16(d);
|
1080
|
+
|
1081
|
+
for (int j = 0; j < 8; j++) {
|
1082
|
+
const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
|
1083
|
+
const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
|
1084
|
+
|
1085
|
+
y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
|
1086
|
+
y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
|
1087
|
+
y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
|
1088
|
+
y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
|
1089
|
+
}
|
1090
|
+
}
|
1059
1091
|
#elif defined(__AVX2__) || defined(__AVX__)
|
1060
1092
|
for (int i = 0; i < nb; i++) {
|
1061
1093
|
// Load elements into 4 AVX vectors
|
@@ -1079,7 +1111,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
|
|
1079
1111
|
|
1080
1112
|
// Quantize these floats
|
1081
1113
|
const float d = maxScalar / 127.f;
|
1082
|
-
y[i].d = d;
|
1114
|
+
y[i].d = GGML_FP32_TO_FP16(d);
|
1083
1115
|
const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
|
1084
1116
|
const __m256 mul = _mm256_set1_ps( id );
|
1085
1117
|
|
@@ -1178,7 +1210,7 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r
|
|
1178
1210
|
sum += y[i].qs[QK8_1/2 + j];
|
1179
1211
|
}
|
1180
1212
|
|
1181
|
-
y[i].s = d * sum;
|
1213
|
+
y[i].s = sum*d;
|
1182
1214
|
}
|
1183
1215
|
}
|
1184
1216
|
|
@@ -1224,6 +1256,48 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
|
|
1224
1256
|
|
1225
1257
|
y[i].s = d * vaddvq_s32(accv);
|
1226
1258
|
}
|
1259
|
+
#elif defined(__wasm_simd128__)
|
1260
|
+
for (int i = 0; i < nb; i++) {
|
1261
|
+
v128_t srcv [8];
|
1262
|
+
v128_t asrcv[8];
|
1263
|
+
v128_t amaxv[8];
|
1264
|
+
|
1265
|
+
for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
|
1266
|
+
for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
|
1267
|
+
|
1268
|
+
for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
|
1269
|
+
for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
|
1270
|
+
for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
|
1271
|
+
|
1272
|
+
const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
|
1273
|
+
wasm_f32x4_extract_lane(amaxv[0], 1)),
|
1274
|
+
MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
|
1275
|
+
wasm_f32x4_extract_lane(amaxv[0], 3)));
|
1276
|
+
|
1277
|
+
const float d = amax / ((1 << 7) - 1);
|
1278
|
+
const float id = d ? 1.0f/d : 0.0f;
|
1279
|
+
|
1280
|
+
y[i].d = d;
|
1281
|
+
|
1282
|
+
v128_t accv = wasm_i32x4_splat(0);
|
1283
|
+
|
1284
|
+
for (int j = 0; j < 8; j++) {
|
1285
|
+
const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
|
1286
|
+
const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
|
1287
|
+
|
1288
|
+
y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
|
1289
|
+
y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
|
1290
|
+
y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
|
1291
|
+
y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
|
1292
|
+
|
1293
|
+
accv = wasm_i32x4_add(accv, vi);
|
1294
|
+
}
|
1295
|
+
|
1296
|
+
y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) +
|
1297
|
+
wasm_i32x4_extract_lane(accv, 1) +
|
1298
|
+
wasm_i32x4_extract_lane(accv, 2) +
|
1299
|
+
wasm_i32x4_extract_lane(accv, 3));
|
1300
|
+
}
|
1227
1301
|
#elif defined(__AVX2__) || defined(__AVX__)
|
1228
1302
|
for (int i = 0; i < nb; i++) {
|
1229
1303
|
// Load elements into 4 AVX vectors
|
@@ -1330,7 +1404,7 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict
|
|
1330
1404
|
const int nb = k / qk;
|
1331
1405
|
|
1332
1406
|
for (int i = 0; i < nb; i++) {
|
1333
|
-
const float d = x[i].d;
|
1407
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
1334
1408
|
|
1335
1409
|
for (int j = 0; j < qk/2; ++j) {
|
1336
1410
|
const int x0 = (x[i].qs[j] & 0x0F) - 8;
|
@@ -1350,8 +1424,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict
|
|
1350
1424
|
const int nb = k / qk;
|
1351
1425
|
|
1352
1426
|
for (int i = 0; i < nb; i++) {
|
1353
|
-
const float d = x[i].d;
|
1354
|
-
const float m = x[i].m;
|
1427
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
1428
|
+
const float m = GGML_FP16_TO_FP32(x[i].m);
|
1355
1429
|
|
1356
1430
|
for (int j = 0; j < qk/2; ++j) {
|
1357
1431
|
const int x0 = (x[i].qs[j] & 0x0F);
|
@@ -1426,7 +1500,7 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
|
|
1426
1500
|
const block_q8_0 * restrict x = vx;
|
1427
1501
|
|
1428
1502
|
for (int i = 0; i < nb; i++) {
|
1429
|
-
const float d = x[i].d;
|
1503
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
1430
1504
|
|
1431
1505
|
for (int j = 0; j < qk; ++j) {
|
1432
1506
|
y[i*qk + j] = x[i].qs[j]*d;
|
@@ -1690,8 +1764,9 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
|
|
1690
1764
|
static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
|
1691
1765
|
float tmp[8];
|
1692
1766
|
|
1693
|
-
for (int i = 0; i < 8; i++)
|
1767
|
+
for (int i = 0; i < 8; i++) {
|
1694
1768
|
tmp[i] = GGML_FP16_TO_FP32(x[i]);
|
1769
|
+
}
|
1695
1770
|
|
1696
1771
|
return _mm256_loadu_ps(tmp);
|
1697
1772
|
}
|
@@ -2111,8 +2186,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
2111
2186
|
const block_q8_0 * restrict y0 = &y[i + 0];
|
2112
2187
|
const block_q8_0 * restrict y1 = &y[i + 1];
|
2113
2188
|
|
2114
|
-
const uint8x16_t m4b
|
2115
|
-
const int8x16_t s8b
|
2189
|
+
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
2190
|
+
const int8x16_t s8b = vdupq_n_s8(0x8);
|
2116
2191
|
|
2117
2192
|
const uint8x16_t v0_0 = vld1q_u8(x0->qs);
|
2118
2193
|
const uint8x16_t v0_1 = vld1q_u8(x1->qs);
|
@@ -2140,8 +2215,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
2140
2215
|
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
|
2141
2216
|
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
|
2142
2217
|
|
2143
|
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
|
2144
|
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
|
2218
|
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
2219
|
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
2145
2220
|
#else
|
2146
2221
|
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
|
2147
2222
|
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
|
@@ -2158,8 +2233,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
2158
2233
|
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
2159
2234
|
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
2160
2235
|
|
2161
|
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
|
2162
|
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
|
2236
|
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
2237
|
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
2163
2238
|
#endif
|
2164
2239
|
}
|
2165
2240
|
|
@@ -2171,7 +2246,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
2171
2246
|
// Main loop
|
2172
2247
|
for (int i = 0; i < nb; ++i) {
|
2173
2248
|
/* Compute combined scale for the block */
|
2174
|
-
const __m256 d =
|
2249
|
+
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
|
2175
2250
|
|
2176
2251
|
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
2177
2252
|
|
@@ -2195,7 +2270,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
2195
2270
|
// Main loop
|
2196
2271
|
for (int i = 0; i < nb; ++i) {
|
2197
2272
|
// Compute combined scale for the block
|
2198
|
-
const __m256 d =
|
2273
|
+
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
|
2199
2274
|
|
2200
2275
|
const __m128i lowMask = _mm_set1_epi8(0xF);
|
2201
2276
|
const __m128i off = _mm_set1_epi8(8);
|
@@ -2237,7 +2312,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
2237
2312
|
_mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0);
|
2238
2313
|
|
2239
2314
|
// Compute combined scale for the block 0 and 1
|
2240
|
-
const __m128 d_0_1 =
|
2315
|
+
const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) );
|
2241
2316
|
|
2242
2317
|
const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
|
2243
2318
|
|
@@ -2255,7 +2330,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
2255
2330
|
_mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);
|
2256
2331
|
|
2257
2332
|
// Compute combined scale for the block 2 and 3
|
2258
|
-
const __m128 d_2_3 =
|
2333
|
+
const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) );
|
2259
2334
|
|
2260
2335
|
const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);
|
2261
2336
|
|
@@ -2288,7 +2363,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
2288
2363
|
_mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
|
2289
2364
|
|
2290
2365
|
// Compute combined scale for the block 0 and 1
|
2291
|
-
const __m128 d_0_1 =
|
2366
|
+
const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
|
2292
2367
|
|
2293
2368
|
const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);
|
2294
2369
|
|
@@ -2306,7 +2381,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
2306
2381
|
_mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
|
2307
2382
|
|
2308
2383
|
// Compute combined scale for the block 2 and 3
|
2309
|
-
const __m128 d_2_3 =
|
2384
|
+
const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) );
|
2310
2385
|
|
2311
2386
|
const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);
|
2312
2387
|
|
@@ -2354,7 +2429,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
|
|
2354
2429
|
sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
|
2355
2430
|
}
|
2356
2431
|
|
2357
|
-
sumf += (x[i].d*y[i].d)*sumi;
|
2432
|
+
sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
|
2358
2433
|
}
|
2359
2434
|
|
2360
2435
|
*s = sumf;
|
@@ -2384,7 +2459,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
|
|
2384
2459
|
const block_q8_1 * restrict y0 = &y[i + 0];
|
2385
2460
|
const block_q8_1 * restrict y1 = &y[i + 1];
|
2386
2461
|
|
2387
|
-
summs += x0->m * y0->s + x1->m * y1->s;
|
2462
|
+
summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s;
|
2388
2463
|
|
2389
2464
|
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
2390
2465
|
|
@@ -2408,8 +2483,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
|
|
2408
2483
|
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
2409
2484
|
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
2410
2485
|
|
2411
|
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
|
2412
|
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
|
2486
|
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
2487
|
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
2413
2488
|
#else
|
2414
2489
|
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
|
2415
2490
|
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
|
@@ -2426,8 +2501,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
|
|
2426
2501
|
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
2427
2502
|
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
2428
2503
|
|
2429
|
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
|
2430
|
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
|
2504
|
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
2505
|
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
2431
2506
|
#endif
|
2432
2507
|
}
|
2433
2508
|
|
@@ -2440,13 +2515,13 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
|
|
2440
2515
|
|
2441
2516
|
// Main loop
|
2442
2517
|
for (int i = 0; i < nb; ++i) {
|
2443
|
-
const float
|
2444
|
-
const float
|
2518
|
+
const float d0 = GGML_FP16_TO_FP32(x[i].d);
|
2519
|
+
const float d1 = y[i].d;
|
2445
2520
|
|
2446
|
-
summs += x[i].m * y[i].s;
|
2521
|
+
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
2447
2522
|
|
2448
|
-
const __m256 d0v =
|
2449
|
-
const __m256 d1v =
|
2523
|
+
const __m256 d0v = _mm256_set1_ps( d0 );
|
2524
|
+
const __m256 d1v = _mm256_set1_ps( d1 );
|
2450
2525
|
|
2451
2526
|
// Compute combined scales
|
2452
2527
|
const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
|
@@ -2480,7 +2555,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
|
|
2480
2555
|
sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
|
2481
2556
|
}
|
2482
2557
|
|
2483
|
-
sumf += (x[i].d*y[i].d)*sumi + x[i].m*y[i].s;
|
2558
|
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
2484
2559
|
}
|
2485
2560
|
|
2486
2561
|
*s = sumf;
|
@@ -2556,16 +2631,13 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|
2556
2631
|
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
2557
2632
|
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
2558
2633
|
|
2559
|
-
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
2560
|
-
const float x1d = GGML_FP16_TO_FP32(x1->d);
|
2561
|
-
|
2562
2634
|
#if defined(__ARM_FEATURE_DOTPROD)
|
2563
2635
|
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
2564
2636
|
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
2565
|
-
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))),
|
2637
|
+
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
2566
2638
|
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
2567
2639
|
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
2568
|
-
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))),
|
2640
|
+
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
2569
2641
|
#else
|
2570
2642
|
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
|
2571
2643
|
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
|
@@ -2582,8 +2654,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|
2582
2654
|
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
2583
2655
|
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
2584
2656
|
|
2585
|
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)),
|
2586
|
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)),
|
2657
|
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
2658
|
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
2587
2659
|
#endif
|
2588
2660
|
}
|
2589
2661
|
|
@@ -2600,7 +2672,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|
2600
2672
|
const block_q8_0 * restrict y0 = &y[i];
|
2601
2673
|
|
2602
2674
|
const v128_t m4b = wasm_i8x16_splat(0x0F);
|
2603
|
-
const v128_t s16b = wasm_i8x16_splat(0x10);
|
2604
2675
|
|
2605
2676
|
// extract the 5th bit
|
2606
2677
|
memcpy(&qh, x0->qh, sizeof(qh));
|
@@ -2638,15 +2709,14 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|
2638
2709
|
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
|
2639
2710
|
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
|
2640
2711
|
|
2641
|
-
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
2642
|
-
|
2643
2712
|
// dot product
|
2644
2713
|
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
|
2645
2714
|
wasm_i32x4_add(
|
2646
2715
|
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
|
2647
2716
|
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
2648
2717
|
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
2649
|
-
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
|
2718
|
+
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
|
2719
|
+
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
|
2650
2720
|
}
|
2651
2721
|
|
2652
2722
|
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
@@ -2658,7 +2728,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|
2658
2728
|
// Main loop
|
2659
2729
|
for (int i = 0; i < nb; i++) {
|
2660
2730
|
/* Compute combined scale for the block */
|
2661
|
-
const __m256 d =
|
2731
|
+
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
2662
2732
|
|
2663
2733
|
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
2664
2734
|
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
@@ -2682,7 +2752,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|
2682
2752
|
// Main loop
|
2683
2753
|
for (int i = 0; i < nb; i++) {
|
2684
2754
|
/* Compute combined scale for the block */
|
2685
|
-
const __m256 d =
|
2755
|
+
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
2686
2756
|
|
2687
2757
|
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
2688
2758
|
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
|
@@ -2725,7 +2795,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|
2725
2795
|
sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
|
2726
2796
|
}
|
2727
2797
|
|
2728
|
-
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi;
|
2798
|
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
|
2729
2799
|
}
|
2730
2800
|
|
2731
2801
|
*s = sumf;
|
@@ -2807,16 +2877,13 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|
2807
2877
|
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
2808
2878
|
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
2809
2879
|
|
2810
|
-
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
2811
|
-
const float x1d = GGML_FP16_TO_FP32(x1->d);
|
2812
|
-
|
2813
2880
|
#if defined(__ARM_FEATURE_DOTPROD)
|
2814
2881
|
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
2815
2882
|
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
2816
|
-
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))),
|
2883
|
+
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
2817
2884
|
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
2818
2885
|
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
2819
|
-
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))),
|
2886
|
+
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
2820
2887
|
#else
|
2821
2888
|
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
|
2822
2889
|
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
|
@@ -2833,8 +2900,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|
2833
2900
|
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
2834
2901
|
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
2835
2902
|
|
2836
|
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)),
|
2837
|
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)),
|
2903
|
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
2904
|
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
2838
2905
|
#endif
|
2839
2906
|
}
|
2840
2907
|
|
@@ -2873,8 +2940,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|
2873
2940
|
const v128_t v0l = wasm_v128_and (v0, m4b);
|
2874
2941
|
const v128_t v0h = wasm_u8x16_shr(v0, 4);
|
2875
2942
|
|
2876
|
-
static bool x = true;
|
2877
|
-
|
2878
2943
|
// add high bit
|
2879
2944
|
const v128_t v0lf = wasm_v128_or(v0l, qhl);
|
2880
2945
|
const v128_t v0hf = wasm_v128_or(v0h, qhh);
|
@@ -2894,15 +2959,14 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|
2894
2959
|
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
|
2895
2960
|
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
|
2896
2961
|
|
2897
|
-
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
2898
|
-
|
2899
2962
|
// dot product
|
2900
|
-
sumv = wasm_f32x4_add(sumv,
|
2901
|
-
|
2963
|
+
sumv = wasm_f32x4_add(sumv,
|
2964
|
+
wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
|
2902
2965
|
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
|
2903
2966
|
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
2904
2967
|
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
2905
|
-
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
|
2968
|
+
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
|
2969
|
+
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
|
2906
2970
|
}
|
2907
2971
|
|
2908
2972
|
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
@@ -2924,7 +2988,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|
2924
2988
|
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
|
2925
2989
|
bx = _mm256_or_si256(bx, bxhi);
|
2926
2990
|
|
2927
|
-
const __m256 dy =
|
2991
|
+
const __m256 dy = _mm256_set1_ps(y[i].d);
|
2928
2992
|
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
2929
2993
|
|
2930
2994
|
const __m256 q = mul_sum_us8_pairs_float(bx, by);
|
@@ -2958,7 +3022,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|
2958
3022
|
bxh = _mm_or_si128(bxh, bxhih);
|
2959
3023
|
bx = _mm256_set_m128i(bxh, bxl);
|
2960
3024
|
|
2961
|
-
const __m256 dy =
|
3025
|
+
const __m256 dy = _mm256_set1_ps(y[i].d);
|
2962
3026
|
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
2963
3027
|
|
2964
3028
|
const __m256 q = mul_sum_us8_pairs_float(bx, by);
|
@@ -3028,11 +3092,11 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
|
|
3028
3092
|
#if defined(__ARM_FEATURE_DOTPROD)
|
3029
3093
|
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
3030
3094
|
vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
|
3031
|
-
vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), x0->d*y0->d);
|
3095
|
+
vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
3032
3096
|
|
3033
3097
|
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
3034
3098
|
vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
|
3035
|
-
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), x1->d*y1->d);
|
3099
|
+
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
3036
3100
|
|
3037
3101
|
#else
|
3038
3102
|
const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
|
@@ -3050,8 +3114,8 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
|
|
3050
3114
|
const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
|
3051
3115
|
const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
|
3052
3116
|
|
3053
|
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), x0->d*y0->d);
|
3054
|
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), x1->d*y1->d);
|
3117
|
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
3118
|
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
3055
3119
|
#endif
|
3056
3120
|
}
|
3057
3121
|
|
@@ -3063,7 +3127,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
|
|
3063
3127
|
// Main loop
|
3064
3128
|
for (int i = 0; i < nb; ++i) {
|
3065
3129
|
// Compute combined scale for the block
|
3066
|
-
const __m256 d =
|
3130
|
+
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
|
3067
3131
|
__m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
|
3068
3132
|
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
3069
3133
|
|
@@ -3089,7 +3153,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
|
|
3089
3153
|
sumi += x[i].qs[j]*y[i].qs[j];
|
3090
3154
|
}
|
3091
3155
|
|
3092
|
-
sumf += (x[i].d*y[i].d)
|
3156
|
+
sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
|
3093
3157
|
}
|
3094
3158
|
|
3095
3159
|
*s = sumf;
|
@@ -3478,6 +3542,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
|
|
3478
3542
|
"ROPE",
|
3479
3543
|
"ROPE_BACK",
|
3480
3544
|
"ALIBI",
|
3545
|
+
"CLAMP",
|
3481
3546
|
"CONV_1D_1S",
|
3482
3547
|
"CONV_1D_2S",
|
3483
3548
|
|
@@ -3488,7 +3553,8 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
|
|
3488
3553
|
"MAP_BINARY",
|
3489
3554
|
};
|
3490
3555
|
|
3491
|
-
static_assert(GGML_OP_COUNT ==
|
3556
|
+
static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");
|
3557
|
+
|
3492
3558
|
|
3493
3559
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
3494
3560
|
"none",
|
@@ -3538,6 +3604,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
3538
3604
|
"rope(x)",
|
3539
3605
|
"rope_back(x)",
|
3540
3606
|
"alibi(x)",
|
3607
|
+
"clamp(x)",
|
3541
3608
|
"conv_1d_1s(x)",
|
3542
3609
|
"conv_1d_2s(x)",
|
3543
3610
|
|
@@ -3548,7 +3615,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
3548
3615
|
"f(x,y)",
|
3549
3616
|
};
|
3550
3617
|
|
3551
|
-
static_assert(GGML_OP_COUNT ==
|
3618
|
+
static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");
|
3552
3619
|
|
3553
3620
|
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
3554
3621
|
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
|
@@ -3782,6 +3849,12 @@ static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct g
|
|
3782
3849
|
(t1->ne[3]%t0->ne[3] == 0);
|
3783
3850
|
}
|
3784
3851
|
|
3852
|
+
static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
3853
|
+
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
3854
|
+
|
3855
|
+
return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
|
3856
|
+
}
|
3857
|
+
|
3785
3858
|
// Rounds n up to the next multiple of 32; values that are already a multiple
// of 32 are returned unchanged.
//
// The addition is done in unsigned arithmetic: with plain int, `n + 31` is
// signed overflow (undefined behavior) for n > INT_MAX - 31. Results for every
// input that did not overflow before are unchanged.
static inline int ggml_up32(int n) {
    return (int) (((unsigned int) n + 31u) & ~31u);
}
|
@@ -4664,11 +4737,15 @@ struct ggml_tensor * ggml_mul_impl(
|
|
4664
4737
|
struct ggml_tensor * a,
|
4665
4738
|
struct ggml_tensor * b,
|
4666
4739
|
bool inplace) {
|
4667
|
-
|
4740
|
+
// TODO: support less-strict constraint
|
4741
|
+
// GGML_ASSERT(ggml_can_repeat(b, a));
|
4742
|
+
GGML_ASSERT(ggml_can_repeat_rows(b, a));
|
4668
4743
|
|
4669
4744
|
bool is_node = false;
|
4670
4745
|
|
4671
4746
|
if (!inplace && (a->grad || b->grad)) {
|
4747
|
+
// TODO: support backward pass for broadcasting
|
4748
|
+
GGML_ASSERT(ggml_are_same_shape(a, b));
|
4672
4749
|
is_node = true;
|
4673
4750
|
}
|
4674
4751
|
|
@@ -6210,7 +6287,8 @@ struct ggml_tensor * ggml_alibi(
|
|
6210
6287
|
struct ggml_context * ctx,
|
6211
6288
|
struct ggml_tensor * a,
|
6212
6289
|
int n_past,
|
6213
|
-
int n_head
|
6290
|
+
int n_head,
|
6291
|
+
float bias_max) {
|
6214
6292
|
GGML_ASSERT(n_past >= 0);
|
6215
6293
|
bool is_node = false;
|
6216
6294
|
|
@@ -6229,6 +6307,8 @@ struct ggml_tensor * ggml_alibi(
|
|
6229
6307
|
|
6230
6308
|
((int32_t *) b->data)[0] = n_past;
|
6231
6309
|
((int32_t *) b->data)[1] = n_head;
|
6310
|
+
GGML_ASSERT(sizeof(float) == sizeof(int32_t));
|
6311
|
+
(((float *) b->data)[2]) = bias_max;
|
6232
6312
|
|
6233
6313
|
ggml_scratch_load(ctx);
|
6234
6314
|
|
@@ -6240,6 +6320,40 @@ struct ggml_tensor * ggml_alibi(
|
|
6240
6320
|
return result;
|
6241
6321
|
}
|
6242
6322
|
|
6323
|
+
// ggml_clamp
|
6324
|
+
|
6325
|
+
struct ggml_tensor * ggml_clamp(
|
6326
|
+
struct ggml_context * ctx,
|
6327
|
+
struct ggml_tensor * a,
|
6328
|
+
float min,
|
6329
|
+
float max) {
|
6330
|
+
bool is_node = false;
|
6331
|
+
|
6332
|
+
if (a->grad) {
|
6333
|
+
GGML_ASSERT(false); // TODO: implement backward
|
6334
|
+
is_node = true;
|
6335
|
+
}
|
6336
|
+
|
6337
|
+
// TODO: when implement backward, fix this:
|
6338
|
+
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
6339
|
+
|
6340
|
+
ggml_scratch_save(ctx);
|
6341
|
+
|
6342
|
+
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
|
6343
|
+
|
6344
|
+
((float *) b->data)[0] = min;
|
6345
|
+
((float *) b->data)[1] = max;
|
6346
|
+
|
6347
|
+
ggml_scratch_load(ctx);
|
6348
|
+
|
6349
|
+
result->op = GGML_OP_CLAMP;
|
6350
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6351
|
+
result->src0 = a;
|
6352
|
+
result->src1 = b;
|
6353
|
+
|
6354
|
+
return result;
|
6355
|
+
}
|
6356
|
+
|
6243
6357
|
// ggml_conv_1d_1s
|
6244
6358
|
|
6245
6359
|
struct ggml_tensor * ggml_conv_1d_1s(
|
@@ -7966,7 +8080,7 @@ static void ggml_compute_forward_mul_f32(
|
|
7966
8080
|
const struct ggml_tensor * src0,
|
7967
8081
|
const struct ggml_tensor * src1,
|
7968
8082
|
struct ggml_tensor * dst) {
|
7969
|
-
|
8083
|
+
GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
|
7970
8084
|
|
7971
8085
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
7972
8086
|
return;
|
@@ -7974,10 +8088,25 @@ static void ggml_compute_forward_mul_f32(
|
|
7974
8088
|
const int ith = params->ith;
|
7975
8089
|
const int nth = params->nth;
|
7976
8090
|
|
7977
|
-
|
7978
|
-
|
7979
|
-
|
7980
|
-
|
8091
|
+
#ifdef GGML_USE_CUBLAS
|
8092
|
+
if (src1->backend == GGML_BACKEND_CUDA) {
|
8093
|
+
if (ith == 0) {
|
8094
|
+
ggml_cuda_mul(src0, src1, dst);
|
8095
|
+
}
|
8096
|
+
return;
|
8097
|
+
}
|
8098
|
+
#endif
|
8099
|
+
|
8100
|
+
const int64_t nr = ggml_nrows(src0);
|
8101
|
+
|
8102
|
+
const int64_t ne00 = src0->ne[0];
|
8103
|
+
const int64_t ne01 = src0->ne[1];
|
8104
|
+
const int64_t ne02 = src0->ne[2];
|
8105
|
+
|
8106
|
+
const int64_t ne10 = src1->ne[0];
|
8107
|
+
const int64_t ne11 = src1->ne[1];
|
8108
|
+
const int64_t ne12 = src1->ne[2];
|
8109
|
+
const int64_t ne13 = src1->ne[3];
|
7981
8110
|
|
7982
8111
|
const size_t nb00 = src0->nb[0];
|
7983
8112
|
const size_t nb01 = src0->nb[1];
|
@@ -7996,44 +8125,51 @@ static void ggml_compute_forward_mul_f32(
|
|
7996
8125
|
|
7997
8126
|
GGML_ASSERT( nb0 == sizeof(float));
|
7998
8127
|
GGML_ASSERT(nb00 == sizeof(float));
|
8128
|
+
GGML_ASSERT(ne00 == ne10);
|
7999
8129
|
|
8000
8130
|
if (nb10 == sizeof(float)) {
|
8001
|
-
for (
|
8002
|
-
// src0
|
8003
|
-
const
|
8004
|
-
const
|
8005
|
-
const
|
8131
|
+
for (int64_t ir = ith; ir < nr; ir += nth) {
|
8132
|
+
// src0 and dst are same shape => same indices
|
8133
|
+
const int64_t i03 = ir/(ne02*ne01);
|
8134
|
+
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
|
8135
|
+
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
8136
|
+
|
8137
|
+
const int64_t i13 = i03 % ne13;
|
8138
|
+
const int64_t i12 = i02 % ne12;
|
8139
|
+
const int64_t i11 = i01 % ne11;
|
8006
8140
|
|
8141
|
+
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
8142
|
+
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
8143
|
+
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
|
8007
8144
|
|
8008
8145
|
#ifdef GGML_USE_ACCELERATE
|
8009
8146
|
UNUSED(ggml_vec_mul_f32);
|
8010
8147
|
|
8011
|
-
vDSP_vmul(
|
8012
|
-
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
|
8013
|
-
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
|
8014
|
-
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
|
8015
|
-
ne0);
|
8148
|
+
vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
|
8016
8149
|
#else
|
8017
|
-
ggml_vec_mul_f32(
|
8018
|
-
(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
|
8019
|
-
(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
|
8020
|
-
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
|
8150
|
+
ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
|
8021
8151
|
#endif
|
8022
8152
|
// }
|
8023
8153
|
// }
|
8024
8154
|
}
|
8025
8155
|
} else {
|
8026
8156
|
// src1 is not contiguous
|
8027
|
-
for (
|
8028
|
-
// src0
|
8029
|
-
|
8030
|
-
const
|
8031
|
-
const
|
8157
|
+
for (int64_t ir = ith; ir < nr; ir += nth) {
|
8158
|
+
// src0 and dst are same shape => same indices
|
8159
|
+
// src1 is broadcastable across src0 and dst in i1, i2, i3
|
8160
|
+
const int64_t i03 = ir/(ne02*ne01);
|
8161
|
+
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
|
8162
|
+
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
8032
8163
|
|
8033
|
-
|
8034
|
-
|
8035
|
-
|
8036
|
-
|
8164
|
+
const int64_t i13 = i03 % ne13;
|
8165
|
+
const int64_t i12 = i02 % ne12;
|
8166
|
+
const int64_t i11 = i01 % ne11;
|
8167
|
+
|
8168
|
+
float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
8169
|
+
float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
8170
|
+
|
8171
|
+
for (int64_t i0 = 0; i0 < ne00; i0++) {
|
8172
|
+
float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
|
8037
8173
|
|
8038
8174
|
dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
|
8039
8175
|
}
|
@@ -9295,7 +9431,7 @@ static void ggml_compute_forward_rms_norm_back(
|
|
9295
9431
|
|
9296
9432
|
// ggml_compute_forward_mul_mat
|
9297
9433
|
|
9298
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9434
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9299
9435
|
// helper function to determine if it is better to use BLAS or not
|
9300
9436
|
// for large matrices, BLAS is faster
|
9301
9437
|
static bool ggml_compute_forward_mul_mat_use_blas(
|
@@ -9336,7 +9472,7 @@ static void ggml_compute_forward_mul_mat_f32(
|
|
9336
9472
|
const int64_t ne02 = src0->ne[2];
|
9337
9473
|
const int64_t ne03 = src0->ne[3];
|
9338
9474
|
|
9339
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9475
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9340
9476
|
const int64_t ne10 = src1->ne[0];
|
9341
9477
|
#endif
|
9342
9478
|
const int64_t ne11 = src1->ne[1];
|
@@ -9400,9 +9536,16 @@ static void ggml_compute_forward_mul_mat_f32(
|
|
9400
9536
|
}
|
9401
9537
|
return;
|
9402
9538
|
}
|
9539
|
+
#elif defined(GGML_USE_CLBLAST)
|
9540
|
+
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
9541
|
+
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
9542
|
+
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
9543
|
+
}
|
9544
|
+
return;
|
9545
|
+
}
|
9403
9546
|
#endif
|
9404
9547
|
|
9405
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9548
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9406
9549
|
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
9407
9550
|
if (params->ith != 0) {
|
9408
9551
|
return;
|
@@ -9422,21 +9565,11 @@ static void ggml_compute_forward_mul_mat_f32(
|
|
9422
9565
|
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
9423
9566
|
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
9424
9567
|
|
9425
|
-
#if defined(GGML_USE_CLBLAST)
|
9426
|
-
// zT = y * xT
|
9427
|
-
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
9428
|
-
ne11, ne01, ne10,
|
9429
|
-
1.0f, y, ne10,
|
9430
|
-
x, ne10,
|
9431
|
-
0.0f, d, ne01,
|
9432
|
-
GGML_TYPE_F32);
|
9433
|
-
#else
|
9434
9568
|
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
9435
9569
|
ne11, ne01, ne10,
|
9436
9570
|
1.0f, y, ne10,
|
9437
9571
|
x, ne00,
|
9438
9572
|
0.0f, d, ne01);
|
9439
|
-
#endif
|
9440
9573
|
}
|
9441
9574
|
}
|
9442
9575
|
//printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
@@ -9575,9 +9708,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
9575
9708
|
}
|
9576
9709
|
return;
|
9577
9710
|
}
|
9711
|
+
#elif defined(GGML_USE_CLBLAST)
|
9712
|
+
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
9713
|
+
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
9714
|
+
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
9715
|
+
}
|
9716
|
+
return;
|
9717
|
+
}
|
9578
9718
|
#endif
|
9579
9719
|
|
9580
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9720
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9581
9721
|
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
9582
9722
|
GGML_ASSERT(nb10 == sizeof(float));
|
9583
9723
|
|
@@ -9607,20 +9747,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
9607
9747
|
assert(id*sizeof(float) <= params->wsize);
|
9608
9748
|
}
|
9609
9749
|
|
9610
|
-
#if defined(GGML_USE_CLBLAST)
|
9611
|
-
const float * x = wdata;
|
9612
|
-
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
9613
|
-
|
9614
|
-
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
9615
|
-
|
9616
|
-
// zT = y * xT
|
9617
|
-
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
9618
|
-
ne11, ne01, ne10,
|
9619
|
-
1.0f, y, ne10,
|
9620
|
-
x, ne10,
|
9621
|
-
0.0f, d, ne01,
|
9622
|
-
GGML_TYPE_F32);
|
9623
|
-
#else
|
9624
9750
|
const float * x = wdata;
|
9625
9751
|
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
9626
9752
|
|
@@ -9632,7 +9758,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
9632
9758
|
1.0f, y, ne10,
|
9633
9759
|
x, ne00,
|
9634
9760
|
0.0f, d, ne01);
|
9635
|
-
#endif
|
9636
9761
|
}
|
9637
9762
|
}
|
9638
9763
|
|
@@ -9795,9 +9920,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
9795
9920
|
}
|
9796
9921
|
return;
|
9797
9922
|
}
|
9923
|
+
#elif defined(GGML_USE_CLBLAST)
|
9924
|
+
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
9925
|
+
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
9926
|
+
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
9927
|
+
}
|
9928
|
+
return;
|
9929
|
+
}
|
9798
9930
|
#endif
|
9799
9931
|
|
9800
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9932
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9801
9933
|
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
9802
9934
|
if (params->ith != 0) {
|
9803
9935
|
return;
|
@@ -9820,9 +9952,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
9820
9952
|
|
9821
9953
|
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
9822
9954
|
|
9823
|
-
#if defined(GGML_USE_CLBLAST)
|
9824
|
-
const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
|
9825
|
-
#else
|
9826
9955
|
{
|
9827
9956
|
size_t id = 0;
|
9828
9957
|
for (int64_t i01 = 0; i01 < ne01; ++i01) {
|
@@ -9834,23 +9963,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|
9834
9963
|
}
|
9835
9964
|
|
9836
9965
|
const float * x = wdata;
|
9837
|
-
#endif
|
9838
9966
|
|
9839
|
-
#if defined(GGML_USE_CLBLAST)
|
9840
|
-
// zT = y * xT
|
9841
|
-
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
9842
|
-
ne11, ne01, ne10,
|
9843
|
-
1.0f, y, ne10,
|
9844
|
-
x, ne10,
|
9845
|
-
0.0f, d, ne01,
|
9846
|
-
type);
|
9847
|
-
#else
|
9848
9967
|
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
9849
9968
|
ne11, ne01, ne10,
|
9850
9969
|
1.0f, y, ne10,
|
9851
9970
|
x, ne00,
|
9852
9971
|
0.0f, d, ne01);
|
9853
|
-
#endif
|
9854
9972
|
}
|
9855
9973
|
}
|
9856
9974
|
|
@@ -10527,6 +10645,7 @@ static void ggml_compute_forward_diag_mask_f32(
|
|
10527
10645
|
|
10528
10646
|
const int n_past = ((int32_t *) src1->data)[0];
|
10529
10647
|
const bool inplace = (bool)((int32_t *) src1->data)[1];
|
10648
|
+
|
10530
10649
|
assert(n_past >= 0);
|
10531
10650
|
|
10532
10651
|
if (!inplace && (params->type == GGML_TASK_INIT)) {
|
@@ -10697,14 +10816,15 @@ static void ggml_compute_forward_alibi_f32(
|
|
10697
10816
|
struct ggml_tensor * dst) {
|
10698
10817
|
assert(params->ith == 0);
|
10699
10818
|
assert(src1->type == GGML_TYPE_I32);
|
10700
|
-
assert(ggml_nelements(src1) ==
|
10819
|
+
assert(ggml_nelements(src1) == 3);
|
10701
10820
|
|
10702
10821
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
10703
10822
|
return;
|
10704
10823
|
}
|
10705
10824
|
|
10706
|
-
const int
|
10707
|
-
const int
|
10825
|
+
const int n_past = ((int32_t *) src1->data)[0];
|
10826
|
+
const int n_head = ((int32_t *) src1->data)[1];
|
10827
|
+
const float max_bias = ((float *) src1->data)[2];
|
10708
10828
|
|
10709
10829
|
assert(n_past >= 0);
|
10710
10830
|
|
@@ -10727,8 +10847,8 @@ static void ggml_compute_forward_alibi_f32(
|
|
10727
10847
|
// add alibi to src0 (KQ_scaled)
|
10728
10848
|
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
10729
10849
|
|
10730
|
-
const float m0 = powf(2.0f, -
|
10731
|
-
const float m1 = powf(2.0f, -
|
10850
|
+
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
10851
|
+
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
10732
10852
|
|
10733
10853
|
for (int i = 0; i < ne0; i++) {
|
10734
10854
|
for (int j = 0; j < ne1; j++) {
|
@@ -10746,13 +10866,13 @@ static void ggml_compute_forward_alibi_f32(
|
|
10746
10866
|
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
|
10747
10867
|
}
|
10748
10868
|
|
10749
|
-
pdst[0] = i * m_k + src[0];
|
10869
|
+
pdst[0] = (i-ne0+1) * m_k + src[0];
|
10870
|
+
|
10750
10871
|
}
|
10751
10872
|
}
|
10752
10873
|
}
|
10753
10874
|
}
|
10754
10875
|
|
10755
|
-
|
10756
10876
|
static void ggml_compute_forward_alibi_f16(
|
10757
10877
|
const struct ggml_compute_params * params,
|
10758
10878
|
const struct ggml_tensor * src0,
|
@@ -10760,14 +10880,15 @@ static void ggml_compute_forward_alibi_f16(
|
|
10760
10880
|
struct ggml_tensor * dst) {
|
10761
10881
|
assert(params->ith == 0);
|
10762
10882
|
assert(src1->type == GGML_TYPE_I32);
|
10763
|
-
assert(ggml_nelements(src1) ==
|
10883
|
+
assert(ggml_nelements(src1) == 3);
|
10764
10884
|
|
10765
10885
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
10766
10886
|
return;
|
10767
10887
|
}
|
10768
10888
|
|
10769
|
-
const int
|
10770
|
-
const int
|
10889
|
+
const int n_past = ((int32_t *) src1->data)[0];
|
10890
|
+
const int n_head = ((int32_t *) src1->data)[1];
|
10891
|
+
const float max_bias = ((float *) src1->data)[2];
|
10771
10892
|
|
10772
10893
|
assert(n_past >= 0);
|
10773
10894
|
|
@@ -10790,8 +10911,8 @@ static void ggml_compute_forward_alibi_f16(
|
|
10790
10911
|
// add alibi to src0 (KQ_scaled)
|
10791
10912
|
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
10792
10913
|
|
10793
|
-
const float m0 = powf(2.0f, -
|
10794
|
-
const float m1 = powf(2.0f, -
|
10914
|
+
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
10915
|
+
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
10795
10916
|
|
10796
10917
|
for (int i = 0; i < ne0; i++) {
|
10797
10918
|
for (int j = 0; j < ne1; j++) {
|
@@ -10810,7 +10931,7 @@ static void ggml_compute_forward_alibi_f16(
|
|
10810
10931
|
}
|
10811
10932
|
|
10812
10933
|
// we return F32
|
10813
|
-
pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
|
10934
|
+
pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]);
|
10814
10935
|
}
|
10815
10936
|
}
|
10816
10937
|
}
|
@@ -10846,6 +10967,77 @@ static void ggml_compute_forward_alibi(
|
|
10846
10967
|
}
|
10847
10968
|
}
|
10848
10969
|
|
10970
|
+
|
10971
|
+
// ggml_compute_forward_clamp
|
10972
|
+
|
10973
|
+
static void ggml_compute_forward_clamp_f32(
|
10974
|
+
const struct ggml_compute_params * params,
|
10975
|
+
const struct ggml_tensor * src0,
|
10976
|
+
const struct ggml_tensor * src1,
|
10977
|
+
struct ggml_tensor * dst) {
|
10978
|
+
assert(params->ith == 0);
|
10979
|
+
assert(src1->type == GGML_TYPE_I32);
|
10980
|
+
assert(ggml_nelements(src1) == 2);
|
10981
|
+
|
10982
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
10983
|
+
return;
|
10984
|
+
}
|
10985
|
+
|
10986
|
+
const int min = ((float *) src1->data)[0];
|
10987
|
+
const int max = ((float *) src1->data)[1];
|
10988
|
+
|
10989
|
+
const int ith = params->ith;
|
10990
|
+
const int nth = params->nth;
|
10991
|
+
|
10992
|
+
const int n = ggml_nrows(src0);
|
10993
|
+
const int nc = src0->ne[0];
|
10994
|
+
|
10995
|
+
const size_t nb00 = src0->nb[0];
|
10996
|
+
const size_t nb01 = src0->nb[1];
|
10997
|
+
|
10998
|
+
const size_t nb0 = dst->nb[0];
|
10999
|
+
const size_t nb1 = dst->nb[1];
|
11000
|
+
|
11001
|
+
GGML_ASSERT( nb0 == sizeof(float));
|
11002
|
+
GGML_ASSERT(nb00 == sizeof(float));
|
11003
|
+
|
11004
|
+
for (int j = ith; j < n; j += nth) {
|
11005
|
+
float * dst_ptr = (float *) ((char *) dst->data + j*nb1);
|
11006
|
+
float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
|
11007
|
+
|
11008
|
+
for (int i = 0; i < nc; i++) {
|
11009
|
+
dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
|
11010
|
+
}
|
11011
|
+
}
|
11012
|
+
}
|
11013
|
+
|
11014
|
+
static void ggml_compute_forward_clamp(
|
11015
|
+
const struct ggml_compute_params * params,
|
11016
|
+
const struct ggml_tensor * src0,
|
11017
|
+
const struct ggml_tensor * src1,
|
11018
|
+
struct ggml_tensor * dst) {
|
11019
|
+
switch (src0->type) {
|
11020
|
+
case GGML_TYPE_F32:
|
11021
|
+
{
|
11022
|
+
ggml_compute_forward_clamp_f32(params, src0, src1, dst);
|
11023
|
+
} break;
|
11024
|
+
case GGML_TYPE_F16:
|
11025
|
+
case GGML_TYPE_Q4_0:
|
11026
|
+
case GGML_TYPE_Q4_1:
|
11027
|
+
case GGML_TYPE_Q5_0:
|
11028
|
+
case GGML_TYPE_Q5_1:
|
11029
|
+
case GGML_TYPE_Q8_0:
|
11030
|
+
case GGML_TYPE_Q8_1:
|
11031
|
+
case GGML_TYPE_I8:
|
11032
|
+
case GGML_TYPE_I16:
|
11033
|
+
case GGML_TYPE_I32:
|
11034
|
+
case GGML_TYPE_COUNT:
|
11035
|
+
{
|
11036
|
+
GGML_ASSERT(false);
|
11037
|
+
} break;
|
11038
|
+
}
|
11039
|
+
}
|
11040
|
+
|
10849
11041
|
// ggml_compute_forward_rope
|
10850
11042
|
|
10851
11043
|
static void ggml_compute_forward_rope_f32(
|
@@ -12827,6 +13019,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
12827
13019
|
{
|
12828
13020
|
ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
|
12829
13021
|
} break;
|
13022
|
+
case GGML_OP_CLAMP:
|
13023
|
+
{
|
13024
|
+
ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
|
13025
|
+
} break;
|
12830
13026
|
case GGML_OP_CONV_1D_1S:
|
12831
13027
|
{
|
12832
13028
|
ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
|
@@ -13134,6 +13330,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
13134
13330
|
{
|
13135
13331
|
GGML_ASSERT(false); // TODO: not implemented
|
13136
13332
|
} break;
|
13333
|
+
case GGML_OP_CLAMP:
|
13334
|
+
{
|
13335
|
+
GGML_ASSERT(false); // TODO: not implemented
|
13336
|
+
} break;
|
13137
13337
|
case GGML_OP_SILU:
|
13138
13338
|
{
|
13139
13339
|
// necessary for llama
|
@@ -13947,9 +14147,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
13947
14147
|
cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
|
13948
14148
|
}
|
13949
14149
|
else
|
14150
|
+
#elif defined(GGML_USE_CLBLAST)
|
14151
|
+
if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
|
14152
|
+
node->n_tasks = 1; // TODO: this actually is doing nothing
|
14153
|
+
// the threads are still spinning
|
14154
|
+
cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
|
14155
|
+
}
|
14156
|
+
else
|
13950
14157
|
#endif
|
13951
14158
|
if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
|
13952
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
14159
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
13953
14160
|
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
13954
14161
|
node->n_tasks = 1; // TODO: this actually is doing nothing
|
13955
14162
|
// the threads are still spinning
|
@@ -13963,13 +14170,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
13963
14170
|
#endif
|
13964
14171
|
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
|
13965
14172
|
cur = 0;
|
13966
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
14173
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
13967
14174
|
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
13968
14175
|
node->n_tasks = 1;
|
13969
14176
|
}
|
13970
14177
|
#endif
|
13971
14178
|
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
|
13972
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
14179
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
13973
14180
|
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
13974
14181
|
node->n_tasks = 1;
|
13975
14182
|
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
|
@@ -14013,6 +14220,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
14013
14220
|
{
|
14014
14221
|
node->n_tasks = 1; //TODO
|
14015
14222
|
} break;
|
14223
|
+
case GGML_OP_CLAMP:
|
14224
|
+
{
|
14225
|
+
node->n_tasks = 1; //TODO
|
14226
|
+
} break;
|
14016
14227
|
case GGML_OP_CONV_1D_1S:
|
14017
14228
|
case GGML_OP_CONV_1D_2S:
|
14018
14229
|
{
|
@@ -14409,9 +14620,12 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|
14409
14620
|
fprintf(fp, "%s |", node->name);
|
14410
14621
|
}
|
14411
14622
|
|
14412
|
-
|
14413
|
-
|
14414
|
-
|
14623
|
+
if (node->n_dims == 2) {
|
14624
|
+
fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], GGML_OP_SYMBOL[node->op]);
|
14625
|
+
} else {
|
14626
|
+
fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
|
14627
|
+
}
|
14628
|
+
|
14415
14629
|
|
14416
14630
|
if (node->grad) {
|
14417
14631
|
fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
|