llama_cpp 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +60 -6
- data/ext/llama_cpp/src/ggml-cuda.h +2 -0
- data/ext/llama_cpp/src/ggml-opencl.c +246 -133
- data/ext/llama_cpp/src/ggml.c +362 -137
- data/ext/llama_cpp/src/ggml.h +13 -3
- data/ext/llama_cpp/src/llama-util.h +23 -23
- data/ext/llama_cpp/src/llama.cpp +173 -102
- data/ext/llama_cpp/src/llama.h +30 -17
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -0
- data/sig/llama_cpp.rbs +1 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -512,7 +512,7 @@ static inline int hsum_i32_4(const __m128i a) {
 return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
 }

-#if __AVX2__ || __AVX512F__
+#if defined(__AVX2__) || defined(__AVX512F__)
 // spread 32 bits to 32 bytes { 0x00, 0xFF }
 static inline __m256i bytes_from_bits_32(const uint8_t * x) {
 uint32_t x32;
@@ -688,7 +688,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // __AVX__ || __AVX2__ || __AVX512F__
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)

-#if __ARM_NEON
+#if defined(__ARM_NEON)

 #if !defined(__aarch64__)

@@ -740,19 +740,19 @@ inline static float vaddvq_f32(float32x4_t v) {
 return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 }

-float vminvq_f32(float32x4_t v) {
+inline static float vminvq_f32(float32x4_t v) {
 return
 MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
 MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
 }

-float vmaxvq_f32(float32x4_t v) {
+inline static float vmaxvq_f32(float32x4_t v) {
 return
 MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
 MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
 }

-int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
 int32x4_t res;

 res[0] = roundf(vgetq_lane_f32(v, 0));
@@ -766,21 +766,20 @@ int32x4_t vcvtnq_s32_f32(float32x4_t v) {
 #endif
 #endif

-
 #define QK4_0 32
 typedef struct {
-
+ggml_fp16_t d; // delta
 uint8_t qs[QK4_0 / 2]; // nibbles / quants
 } block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(
+static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");

 #define QK4_1 32
 typedef struct {
-
-
+ggml_fp16_t d; // delta
+ggml_fp16_t m; // min
 uint8_t qs[QK4_1 / 2]; // nibbles / quants
 } block_q4_1;
-static_assert(sizeof(block_q4_1) == 2 * sizeof(
+static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");

 #define QK5_0 32
 typedef struct {
@@ -801,16 +800,16 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +

 #define QK8_0 32
 typedef struct {
-
-int8_t qs[QK8_0];
+ggml_fp16_t d; // delta
+int8_t qs[QK8_0]; // quants
 } block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(
+static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");

 #define QK8_1 32
 typedef struct {
-float
-float
-int8_t qs[QK8_1];
+float d; // delta
+float s; // d * sum(qs[i])
+int8_t qs[QK8_1]; // quants
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");

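
Note on the block-layout hunk above: q4_0, q4_1 and q8_0 now store their per-block scale (and, for q4_1, the minimum) as ggml_fp16_t instead of a 32-bit float, which is why every read and write of d and m in the hunks below gains a GGML_FP16_TO_FP32 / GGML_FP32_TO_FP16 conversion. A minimal standalone sketch of the resulting layouts, using a hypothetical 16-bit stand-in type rather than ggml's own ggml_fp16_t:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// Hypothetical stand-in for ggml_fp16_t: a 16-bit storage slot for the scale.
typedef uint16_t fp16_storage_t;

#define QK4_0 32
#define QK8_0 32

typedef struct {
    fp16_storage_t d;        // delta (per-block scale)
    uint8_t qs[QK4_0 / 2];   // nibbles / quants
} block_q4_0_sketch;

typedef struct {
    fp16_storage_t d;        // delta (per-block scale)
    int8_t qs[QK8_0];        // quants
} block_q8_0_sketch;

int main(void) {
    // Same layout checks as the updated static_asserts in the diff:
    // 2 + 16 = 18 bytes per q4_0 block, 2 + 32 = 34 bytes per q8_0 block.
    assert(sizeof(block_q4_0_sketch) == sizeof(fp16_storage_t) + QK4_0 / 2);
    assert(sizeof(block_q8_0_sketch) == sizeof(fp16_storage_t) + QK8_0);
    printf("q4_0 block: %zu bytes, q8_0 block: %zu bytes\n",
           sizeof(block_q4_0_sketch), sizeof(block_q8_0_sketch));
    return 0;
}
```

Assuming the previous scale type was a 4-byte float, this shrinks each 32-element block by 2 bytes (q4_0: 20 → 18, q8_0: 36 → 34), at the cost of the conversions seen throughout the rest of this diff.
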
@@ -837,7 +836,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
 const float d = max / -8;
 const float id = d ? 1.0f/d : 0.0f;

-y[i].d = d;
+y[i].d = GGML_FP32_TO_FP16(d);

 for (int j = 0; j < qk/2; ++j) {
 const float x0 = x[i*qk + 0 + j]*id;
@@ -877,8 +876,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r
 const float d = (max - min) / ((1 << 4) - 1);
 const float id = d ? 1.0f/d : 0.0f;

-y[i].d = d;
-y[i].m = min;
+y[i].d = GGML_FP32_TO_FP16(d);
+y[i].m = GGML_FP32_TO_FP16(min);

 for (int j = 0; j < qk/2; ++j) {
 const float x0 = (x[i*qk + 0 + j] - min)*id;
@@ -1009,7 +1008,7 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
 const float d = amax / ((1 << 7) - 1);
 const float id = d ? 1.0f/d : 0.0f;

-y[i].d = d;
+y[i].d = GGML_FP32_TO_FP16(d);

 for (int j = 0; j < QK8_0; ++j) {
 const float x0 = x[i*QK8_0 + j]*id;
@@ -1044,7 +1043,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
 const float d = amax / ((1 << 7) - 1);
 const float id = d ? 1.0f/d : 0.0f;

-y[i].d = d;
+y[i].d = GGML_FP32_TO_FP16(d);

 for (int j = 0; j < 8; j++) {
 const float32x4_t v = vmulq_n_f32(srcv[j], id);
@@ -1056,6 +1055,39 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
 y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
 }
 }
+#elif defined(__wasm_simd128__)
+for (int i = 0; i < nb; i++) {
+v128_t srcv [8];
+v128_t asrcv[8];
+v128_t amaxv[8];
+
+for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
+for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
+
+for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
+for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
+for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
+
+const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
+wasm_f32x4_extract_lane(amaxv[0], 1)),
+MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
+wasm_f32x4_extract_lane(amaxv[0], 3)));
+
+const float d = amax / ((1 << 7) - 1);
+const float id = d ? 1.0f/d : 0.0f;
+
+y[i].d = GGML_FP32_TO_FP16(d);
+
+for (int j = 0; j < 8; j++) {
+const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
+const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
+
+y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
+y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
+y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
+y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
+}
+}
 #elif defined(__AVX2__) || defined(__AVX__)
 for (int i = 0; i < nb; i++) {
 // Load elements into 4 AVX vectors
@@ -1079,7 +1111,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int

 // Quantize these floats
 const float d = maxScalar / 127.f;
-y[i].d = d;
+y[i].d = GGML_FP32_TO_FP16(d);
 const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
 const __m256 mul = _mm256_set1_ps( id );

@@ -1178,7 +1210,7 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r
 sum += y[i].qs[QK8_1/2 + j];
 }

-y[i].s = d
+y[i].s = sum*d;
 }
 }

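
The q8_1 change above precomputes s = sum*d at quantization time so that dot products against q4_1 (which carries a per-block minimum m) can fold the minimum in with one multiply per block instead of re-summing the int8 quants; the ggml_vec_dot_q4_1_q8_1 hunks further down use it as `summs += m * s`. A scalar sketch of that identity, with made-up per-block values:

```c
#include <stdio.h>

// For one block pair: x is q4_1-style (scale d_x, min m_x, quants qx[i] in 0..15),
// y is q8_1-style (scale d_y, quants qy[i], and s_y = d_y * sum(qy[i]) precomputed
// at quantization time, as in quantize_row_q8_1_reference above).
static float block_dot(float d_x, float m_x, const int *qx,
                       float d_y, const int *qy, float s_y, int n) {
    int sumi = 0;
    for (int i = 0; i < n; i++) {
        sumi += qx[i] * qy[i];
    }
    // sum_i (d_x*qx[i] + m_x) * (d_y*qy[i])
    //   = d_x*d_y * sum_i qx[i]*qy[i]  +  m_x * (d_y * sum_i qy[i])
    //   = d_x*d_y * sumi               +  m_x * s_y
    return (d_x * d_y) * sumi + m_x * s_y;
}

int main(void) {
    const int qx[4] = { 1, 5, 9, 15 };
    const int qy[4] = { -3, 7, 2, -8 };
    float d_y = 0.5f, s_y = 0.0f;
    for (int i = 0; i < 4; i++) s_y += qy[i];
    s_y *= d_y;                              // same precomputation as y[i].s = sum*d
    printf("%f\n", block_dot(0.25f, -1.0f, qx, d_y, qy, s_y, 4));
    return 0;
}
```
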
@@ -1224,6 +1256,48 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int

 y[i].s = d * vaddvq_s32(accv);
 }
+#elif defined(__wasm_simd128__)
+for (int i = 0; i < nb; i++) {
+v128_t srcv [8];
+v128_t asrcv[8];
+v128_t amaxv[8];
+
+for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
+for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
+
+for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
+for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
+for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
+
+const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
+wasm_f32x4_extract_lane(amaxv[0], 1)),
+MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
+wasm_f32x4_extract_lane(amaxv[0], 3)));
+
+const float d = amax / ((1 << 7) - 1);
+const float id = d ? 1.0f/d : 0.0f;
+
+y[i].d = d;
+
+v128_t accv = wasm_i32x4_splat(0);
+
+for (int j = 0; j < 8; j++) {
+const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
+const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
+
+y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
+y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
+y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
+y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
+
+accv = wasm_i32x4_add(accv, vi);
+}
+
+y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) +
+wasm_i32x4_extract_lane(accv, 1) +
+wasm_i32x4_extract_lane(accv, 2) +
+wasm_i32x4_extract_lane(accv, 3));
+}
 #elif defined(__AVX2__) || defined(__AVX__)
 for (int i = 0; i < nb; i++) {
 // Load elements into 4 AVX vectors
@@ -1330,7 +1404,7 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict
 const int nb = k / qk;

 for (int i = 0; i < nb; i++) {
-const float d = x[i].d;
+const float d = GGML_FP16_TO_FP32(x[i].d);

 for (int j = 0; j < qk/2; ++j) {
 const int x0 = (x[i].qs[j] & 0x0F) - 8;
@@ -1350,8 +1424,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict
 const int nb = k / qk;

 for (int i = 0; i < nb; i++) {
-const float d = x[i].d;
-const float m = x[i].m;
+const float d = GGML_FP16_TO_FP32(x[i].d);
+const float m = GGML_FP16_TO_FP32(x[i].m);

 for (int j = 0; j < qk/2; ++j) {
 const int x0 = (x[i].qs[j] & 0x0F);
@@ -1426,7 +1500,7 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
 const block_q8_0 * restrict x = vx;

 for (int i = 0; i < nb; i++) {
-const float d = x[i].d;
+const float d = GGML_FP16_TO_FP32(x[i].d);

 for (int j = 0; j < qk; ++j) {
 y[i*qk + j] = x[i].qs[j]*d;
@@ -1690,8 +1764,9 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
 float tmp[8];

-for (int i = 0; i < 8; i++)
+for (int i = 0; i < 8; i++) {
 tmp[i] = GGML_FP16_TO_FP32(x[i]);
+}

 return _mm256_loadu_ps(tmp);
 }
@@ -2111,8 +2186,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 const block_q8_0 * restrict y0 = &y[i + 0];
 const block_q8_0 * restrict y1 = &y[i + 1];

-const uint8x16_t m4b
-const int8x16_t s8b
+const uint8x16_t m4b = vdupq_n_u8(0x0F);
+const int8x16_t s8b = vdupq_n_s8(0x8);

 const uint8x16_t v0_0 = vld1q_u8(x0->qs);
 const uint8x16_t v0_1 = vld1q_u8(x1->qs);
@@ -2140,8 +2215,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
 const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);

-sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
-sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
+sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
 #else
 const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
 const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
@@ -2158,8 +2233,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
 const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));

-sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
-sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
+sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
 #endif
 }

@@ -2171,7 +2246,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 // Main loop
 for (int i = 0; i < nb; ++i) {
 /* Compute combined scale for the block */
-const __m256 d =
+const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );

 __m256i bx = bytes_from_nibbles_32(x[i].qs);

@@ -2195,7 +2270,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 // Main loop
 for (int i = 0; i < nb; ++i) {
 // Compute combined scale for the block
-const __m256 d =
+const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );

 const __m128i lowMask = _mm_set1_epi8(0xF);
 const __m128i off = _mm_set1_epi8(8);
@@ -2237,7 +2312,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0);

 // Compute combined scale for the block 0 and 1
-const __m128 d_0_1 =
+const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) );

 const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);

@@ -2255,7 +2330,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);

 // Compute combined scale for the block 2 and 3
-const __m128 d_2_3 =
+const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) );

 const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);

@@ -2288,7 +2363,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);

 // Compute combined scale for the block 0 and 1
-const __m128 d_0_1 =
+const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );

 const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);

@@ -2306,7 +2381,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);

 // Compute combined scale for the block 2 and 3
-const __m128 d_2_3 =
+const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) );

 const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);

@@ -2354,7 +2429,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
 }

-sumf += (x[i].d*y[i].d)
+sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
 }

 *s = sumf;
@@ -2384,7 +2459,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
 const block_q8_1 * restrict y0 = &y[i + 0];
 const block_q8_1 * restrict y1 = &y[i + 1];

-summs += x0->m * y0->s + x1->m * y1->s;
+summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s;

 const uint8x16_t m4b = vdupq_n_u8(0x0F);

@@ -2408,8 +2483,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
 const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
 const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);

-sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
-sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
+sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
+sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
 #else
 const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
 const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
@@ -2426,8 +2501,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
 const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
 const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));

-sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
-sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
+sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
+sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
 #endif
 }

@@ -2440,13 +2515,13 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *

 // Main loop
 for (int i = 0; i < nb; ++i) {
-const float
-const float
+const float d0 = GGML_FP16_TO_FP32(x[i].d);
+const float d1 = y[i].d;

-summs += x[i].m * y[i].s;
+summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;

-const __m256 d0v =
-const __m256 d1v =
+const __m256 d0v = _mm256_set1_ps( d0 );
+const __m256 d1v = _mm256_set1_ps( d1 );

 // Compute combined scales
 const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
@@ -2480,7 +2555,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
 sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
 }

-sumf += (x[i].d*y[i].d)*sumi + x[i].m*y[i].s;
+sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
 }

 *s = sumf;
@@ -2556,16 +2631,13 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
 const int8x16_t v1_1l = vld1q_s8(y1->qs);
 const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

-const float x0d = GGML_FP16_TO_FP32(x0->d);
-const float x1d = GGML_FP16_TO_FP32(x1->d);
-
 #if defined(__ARM_FEATURE_DOTPROD)
 sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
 vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))),
+vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
 sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
 vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))),
+vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
 #else
 const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
 const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
@@ -2582,8 +2654,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
 const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
 const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));

-sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)),
-sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)),
+sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
 #endif
 }

@@ -2600,7 +2672,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
 const block_q8_0 * restrict y0 = &y[i];

 const v128_t m4b = wasm_i8x16_splat(0x0F);
-const v128_t s16b = wasm_i8x16_splat(0x10);

 // extract the 5th bit
 memcpy(&qh, x0->qh, sizeof(qh));
@@ -2638,15 +2709,14 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
 const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
 const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);

-const float x0d = GGML_FP16_TO_FP32(x0->d);
-
 // dot product
 sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
 wasm_i32x4_add(
 wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
 wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
 wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
+wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
+wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
 }

 *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -2658,7 +2728,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
 // Main loop
 for (int i = 0; i < nb; i++) {
 /* Compute combined scale for the block */
-const __m256 d =
+const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));

 __m256i bx = bytes_from_nibbles_32(x[i].qs);
 __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -2682,7 +2752,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
 // Main loop
 for (int i = 0; i < nb; i++) {
 /* Compute combined scale for the block */
-const __m256 d =
+const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));

 __m256i bx = bytes_from_nibbles_32(x[i].qs);
 const __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -2725,7 +2795,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
 sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
 }

-sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi;
+sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
 }

 *s = sumf;
@@ -2807,16 +2877,13 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 const int8x16_t v1_1l = vld1q_s8(y1->qs);
 const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

-const float x0d = GGML_FP16_TO_FP32(x0->d);
-const float x1d = GGML_FP16_TO_FP32(x1->d);
-
 #if defined(__ARM_FEATURE_DOTPROD)
 sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
 vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))),
+vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
 sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
 vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))),
+vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
 #else
 const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
 const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
@@ -2833,8 +2900,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
 const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));

-sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)),
-sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)),
+sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
+sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
 #endif
 }

@@ -2873,8 +2940,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 const v128_t v0l = wasm_v128_and (v0, m4b);
 const v128_t v0h = wasm_u8x16_shr(v0, 4);

-static bool x = true;
-
 // add high bit
 const v128_t v0lf = wasm_v128_or(v0l, qhl);
 const v128_t v0hf = wasm_v128_or(v0h, qhh);
@@ -2894,15 +2959,14 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
 const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);

-const float x0d = GGML_FP16_TO_FP32(x0->d);
-
 // dot product
-sumv = wasm_f32x4_add(sumv,
-
+sumv = wasm_f32x4_add(sumv,
+wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
 wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
 wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
 wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
+wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
+wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
 }

 *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -2924,7 +2988,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
 bx = _mm256_or_si256(bx, bxhi);

-const __m256 dy =
+const __m256 dy = _mm256_set1_ps(y[i].d);
 const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);

 const __m256 q = mul_sum_us8_pairs_float(bx, by);
@@ -2958,7 +3022,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 bxh = _mm_or_si128(bxh, bxhih);
 bx = _mm256_set_m128i(bxh, bxl);

-const __m256 dy =
+const __m256 dy = _mm256_set1_ps(y[i].d);
 const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);

 const __m256 q = mul_sum_us8_pairs_float(bx, by);
@@ -3028,11 +3092,11 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
 #if defined(__ARM_FEATURE_DOTPROD)
 sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
 vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
-vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), x0->d*y0->d);
+vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));

 sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
 vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
-vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), x1->d*y1->d);
+vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));

 #else
 const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
@@ -3050,8 +3114,8 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
 const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
 const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));

-sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), x0->d*y0->d);
-sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), x1->d*y1->d);
+sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
 #endif
 }

@@ -3063,7 +3127,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
 // Main loop
 for (int i = 0; i < nb; ++i) {
 // Compute combined scale for the block
-const __m256 d =
+const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
 __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
 __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);

@@ -3089,7 +3153,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
 sumi += x[i].qs[j]*y[i].qs[j];
 }

-sumf += (x[i].d*y[i].d)
+sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
 }

 *s = sumf;
@@ -3478,6 +3542,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
 "ROPE",
 "ROPE_BACK",
 "ALIBI",
+"CLAMP",
 "CONV_1D_1S",
 "CONV_1D_2S",

@@ -3488,7 +3553,8 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
 "MAP_BINARY",
 };

-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");
+

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 "none",
@@ -3538,6 +3604,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 "rope(x)",
 "rope_back(x)",
 "alibi(x)",
+"clamp(x)",
 "conv_1d_1s(x)",
 "conv_1d_2s(x)",

@@ -3548,7 +3615,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 "f(x,y)",
 };

-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");

 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -3782,6 +3849,12 @@ static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct g
 (t1->ne[3]%t0->ne[3] == 0);
 }

+static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
+}
+
 static inline int ggml_up32(int n) {
 return (n + 31) & ~31;
 }
@@ -4664,11 +4737,15 @@ struct ggml_tensor * ggml_mul_impl(
 struct ggml_tensor * a,
 struct ggml_tensor * b,
 bool inplace) {
-
+// TODO: support less-strict constraint
+// GGML_ASSERT(ggml_can_repeat(b, a));
+GGML_ASSERT(ggml_can_repeat_rows(b, a));

 bool is_node = false;

 if (!inplace && (a->grad || b->grad)) {
+// TODO: support backward pass for broadcasting
+GGML_ASSERT(ggml_are_same_shape(a, b));
 is_node = true;
 }

@@ -6210,7 +6287,8 @@ struct ggml_tensor * ggml_alibi(
 struct ggml_context * ctx,
 struct ggml_tensor * a,
 int n_past,
-int n_head
+int n_head,
+float bias_max) {
 GGML_ASSERT(n_past >= 0);
 bool is_node = false;

@@ -6229,6 +6307,8 @@ struct ggml_tensor * ggml_alibi(

 ((int32_t *) b->data)[0] = n_past;
 ((int32_t *) b->data)[1] = n_head;
+GGML_ASSERT(sizeof(float) == sizeof(int32_t));
+(((float *) b->data)[2]) = bias_max;

 ggml_scratch_load(ctx);

@@ -6240,6 +6320,40 @@ struct ggml_tensor * ggml_alibi(
 return result;
 }

+// ggml_clamp
+
+struct ggml_tensor * ggml_clamp(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+float min,
+float max) {
+bool is_node = false;
+
+if (a->grad) {
+GGML_ASSERT(false); // TODO: implement backward
+is_node = true;
+}
+
+// TODO: when implement backward, fix this:
+struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+
+ggml_scratch_save(ctx);
+
+struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+
+((float *) b->data)[0] = min;
+((float *) b->data)[1] = max;
+
+ggml_scratch_load(ctx);
+
+result->op = GGML_OP_CLAMP;
+result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+result->src0 = a;
+result->src1 = b;
+
+return result;
+}
+
 // ggml_conv_1d_1s

 struct ggml_tensor * ggml_conv_1d_1s(
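
The new ggml_clamp(ctx, a, min, max) above stashes its min/max operands in a small GGML_TYPE_I32 tensor whose storage is reinterpreted as floats, and its forward pass (added later in this diff) clamps each row element-wise. A hedged usage sketch — the context size and tensor values are invented, and it assumes the ggml_init / ggml_build_forward / ggml_graph_compute flow of the ggml revision bundled with this gem:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    // small scratch context; the size is arbitrary for this sketch
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    ggml_set_f32(a, 7.5f);                      // fill with an out-of-range value

    // clamp every element of a into [-1, 1]
    struct ggml_tensor * c = ggml_clamp(ctx, a, -1.0f, 1.0f);

    struct ggml_cgraph gf = ggml_build_forward(c);
    ggml_graph_compute(ctx, &gf);

    printf("%f\n", ggml_get_f32_1d(c, 0));      // expect 1.0 after clamping
    ggml_free(ctx);
    return 0;
}
```
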
@@ -7966,7 +8080,7 @@ static void ggml_compute_forward_mul_f32(
 const struct ggml_tensor * src0,
 const struct ggml_tensor * src1,
 struct ggml_tensor * dst) {
-
+GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));

 if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
 return;
@@ -7974,10 +8088,25 @@ static void ggml_compute_forward_mul_f32(
 const int ith = params->ith;
 const int nth = params->nth;

-
-
-
-
+#ifdef GGML_USE_CUBLAS
+if (src1->backend == GGML_BACKEND_CUDA) {
+if (ith == 0) {
+ggml_cuda_mul(src0, src1, dst);
+}
+return;
+}
+#endif
+
+const int64_t nr = ggml_nrows(src0);
+
+const int64_t ne00 = src0->ne[0];
+const int64_t ne01 = src0->ne[1];
+const int64_t ne02 = src0->ne[2];
+
+const int64_t ne10 = src1->ne[0];
+const int64_t ne11 = src1->ne[1];
+const int64_t ne12 = src1->ne[2];
+const int64_t ne13 = src1->ne[3];

 const size_t nb00 = src0->nb[0];
 const size_t nb01 = src0->nb[1];
@@ -7996,44 +8125,51 @@ static void ggml_compute_forward_mul_f32(

 GGML_ASSERT( nb0 == sizeof(float));
 GGML_ASSERT(nb00 == sizeof(float));
+GGML_ASSERT(ne00 == ne10);

 if (nb10 == sizeof(float)) {
-for (
-// src0
-const
-const
-const
+for (int64_t ir = ith; ir < nr; ir += nth) {
+// src0 and dst are same shape => same indices
+const int64_t i03 = ir/(ne02*ne01);
+const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+const int64_t i13 = i03 % ne13;
+const int64_t i12 = i02 % ne12;
+const int64_t i11 = i01 % ne11;

+float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
+float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);

 #ifdef GGML_USE_ACCELERATE
 UNUSED(ggml_vec_mul_f32);

-vDSP_vmul(
-(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1,
-ne0);
+vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
 #else
-ggml_vec_mul_f32(
-(float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ),
-(float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
 #endif
 // }
 // }
 }
 } else {
 // src1 is not contiguous
-for (
-// src0
-
-const
-const
+for (int64_t ir = ith; ir < nr; ir += nth) {
+// src0 and dst are same shape => same indices
+// src1 is broadcastable across src0 and dst in i1, i2, i3
+const int64_t i03 = ir/(ne02*ne01);
+const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);

-
-
-
-
+const int64_t i13 = i03 % ne13;
+const int64_t i12 = i02 % ne12;
+const int64_t i11 = i01 % ne11;
+
+float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
+float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+for (int64_t i0 = 0; i0 < ne00; i0++) {
+float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);

 dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
 }
@@ -10527,6 +10663,7 @@ static void ggml_compute_forward_diag_mask_f32(

 const int n_past = ((int32_t *) src1->data)[0];
 const bool inplace = (bool)((int32_t *) src1->data)[1];
+
 assert(n_past >= 0);

 if (!inplace && (params->type == GGML_TASK_INIT)) {
@@ -10697,14 +10834,15 @@ static void ggml_compute_forward_alibi_f32(
 struct ggml_tensor * dst) {
 assert(params->ith == 0);
 assert(src1->type == GGML_TYPE_I32);
-assert(ggml_nelements(src1) ==
+assert(ggml_nelements(src1) == 3);

 if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
 return;
 }

-const int
-const int
+const int n_past = ((int32_t *) src1->data)[0];
+const int n_head = ((int32_t *) src1->data)[1];
+const float max_bias = ((float *) src1->data)[2];

 assert(n_past >= 0);

@@ -10727,8 +10865,8 @@ static void ggml_compute_forward_alibi_f32(
 // add alibi to src0 (KQ_scaled)
 const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));

-const float m0 = powf(2.0f, -
-const float m1 = powf(2.0f, -
+const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

 for (int i = 0; i < ne0; i++) {
 for (int j = 0; j < ne1; j++) {
@@ -10746,13 +10884,13 @@ static void ggml_compute_forward_alibi_f32(
 m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
 }

-pdst[0] = i * m_k + src[0];
+pdst[0] = (i-ne0+1) * m_k + src[0];
+
 }
 }
 }
 }

-
 static void ggml_compute_forward_alibi_f16(
 const struct ggml_compute_params * params,
 const struct ggml_tensor * src0,
@@ -10760,14 +10898,15 @@ static void ggml_compute_forward_alibi_f16(
 struct ggml_tensor * dst) {
 assert(params->ith == 0);
 assert(src1->type == GGML_TYPE_I32);
-assert(ggml_nelements(src1) ==
+assert(ggml_nelements(src1) == 3);

 if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
 return;
 }

-const int
-const int
+const int n_past = ((int32_t *) src1->data)[0];
+const int n_head = ((int32_t *) src1->data)[1];
+const float max_bias = ((float *) src1->data)[2];

 assert(n_past >= 0);

@@ -10790,8 +10929,8 @@ static void ggml_compute_forward_alibi_f16(
 // add alibi to src0 (KQ_scaled)
 const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));

-const float m0 = powf(2.0f, -
-const float m1 = powf(2.0f, -
+const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

 for (int i = 0; i < ne0; i++) {
 for (int j = 0; j < ne1; j++) {
@@ -10810,7 +10949,7 @@ static void ggml_compute_forward_alibi_f16(
 }

 // we return F32
-pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
+pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]);
 }
 }
 }
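
The ALiBi hunks above replace the hard-coded slope base with the new max_bias operand and shift the per-position bias to (i - ne0 + 1) * m_k, so the last position receives bias 0 and older positions get increasingly negative values. A scalar sketch of the slope schedule and the bias values (the head count and row length are assumed, and the first-branch slope powf(m0, k + 1) is the pre-existing ggml formula not shown in this hunk):

```c
#include <math.h>
#include <stdio.h>

int main(void) {
    const int   n_head   = 12;      // assumed number of attention heads
    const float max_bias = 8.0f;    // the new bias_max parameter of ggml_alibi
    const int   ne0      = 5;       // assumed row length (number of key positions)

    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));

    // same slope bases as the diff: m0 for the first 2^floor(log2(n_head)) heads,
    // m1 interleaved for the remainder
    const float m0 = powf(2.0f, -(max_bias)        / n_heads_log2_floor);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

    for (int k = 0; k < n_head; k++) {
        float m_k;
        if (k < n_heads_log2_floor) {
            m_k = powf(m0, k + 1);
        } else {
            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
        }
        printf("head %2d slope %.6f biases:", k, m_k);
        for (int i = 0; i < ne0; i++) {
            // bias added to KQ_scaled: 0 at the newest position,
            // increasingly negative for older ones
            printf(" %+.4f", (i - ne0 + 1) * m_k);
        }
        printf("\n");
    }
    return 0;
}
```
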
@@ -10846,6 +10985,77 @@ static void ggml_compute_forward_alibi(
 }
 }

+
+// ggml_compute_forward_clamp
+
+static void ggml_compute_forward_clamp_f32(
+const struct ggml_compute_params * params,
+const struct ggml_tensor * src0,
+const struct ggml_tensor * src1,
+struct ggml_tensor * dst) {
+assert(params->ith == 0);
+assert(src1->type == GGML_TYPE_I32);
+assert(ggml_nelements(src1) == 2);
+
+if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+return;
+}
+
+const int min = ((float *) src1->data)[0];
+const int max = ((float *) src1->data)[1];
+
+const int ith = params->ith;
+const int nth = params->nth;
+
+const int n = ggml_nrows(src0);
+const int nc = src0->ne[0];
+
+const size_t nb00 = src0->nb[0];
+const size_t nb01 = src0->nb[1];
+
+const size_t nb0 = dst->nb[0];
+const size_t nb1 = dst->nb[1];
+
+GGML_ASSERT( nb0 == sizeof(float));
+GGML_ASSERT(nb00 == sizeof(float));
+
+for (int j = ith; j < n; j += nth) {
+float * dst_ptr = (float *) ((char *) dst->data + j*nb1);
+float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
+
+for (int i = 0; i < nc; i++) {
+dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
+}
+}
+}
+
+static void ggml_compute_forward_clamp(
+const struct ggml_compute_params * params,
+const struct ggml_tensor * src0,
+const struct ggml_tensor * src1,
+struct ggml_tensor * dst) {
+switch (src0->type) {
+case GGML_TYPE_F32:
+{
+ggml_compute_forward_clamp_f32(params, src0, src1, dst);
+} break;
+case GGML_TYPE_F16:
+case GGML_TYPE_Q4_0:
+case GGML_TYPE_Q4_1:
+case GGML_TYPE_Q5_0:
+case GGML_TYPE_Q5_1:
+case GGML_TYPE_Q8_0:
+case GGML_TYPE_Q8_1:
+case GGML_TYPE_I8:
+case GGML_TYPE_I16:
+case GGML_TYPE_I32:
+case GGML_TYPE_COUNT:
+{
+GGML_ASSERT(false);
+} break;
+}
+}
+
 // ggml_compute_forward_rope

 static void ggml_compute_forward_rope_f32(
@@ -12827,6 +13037,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
 {
 ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
 } break;
+case GGML_OP_CLAMP:
+{
+ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
+} break;
 case GGML_OP_CONV_1D_1S:
 {
 ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
@@ -13134,6 +13348,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
 {
 GGML_ASSERT(false); // TODO: not implemented
 } break;
+case GGML_OP_CLAMP:
+{
+GGML_ASSERT(false); // TODO: not implemented
+} break;
 case GGML_OP_SILU:
 {
 // necessary for llama
@@ -14013,6 +14231,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 {
 node->n_tasks = 1; //TODO
 } break;
+case GGML_OP_CLAMP:
+{
+node->n_tasks = 1; //TODO
+} break;
 case GGML_OP_CONV_1D_1S:
 case GGML_OP_CONV_1D_2S:
 {
@@ -14409,9 +14631,12 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
 fprintf(fp, "%s |", node->name);
 }

-
-
-
+if (node->n_dims == 2) {
+fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], GGML_OP_SYMBOL[node->op]);
+} else {
+fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
+}
+

 if (node->grad) {
 fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);