llama_cpp 0.3.4 → 0.3.5
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +293 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +304 -99
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +201 -71
- data/ext/llama_cpp/src/ggml-metal.metal +68 -54
- data/ext/llama_cpp/src/ggml.c +713 -978
- data/ext/llama_cpp/src/ggml.h +82 -17
- data/ext/llama_cpp/src/k_quants.c +327 -3
- data/ext/llama_cpp/src/llama.cpp +524 -121
- data/ext/llama_cpp/src/llama.h +60 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +24 -0
- metadata +2 -2
The hunks below are the diff of data/ext/llama_cpp/src/ggml-cuda.cu (the CUDA backend):

@@ -220,7 +220,7 @@ typedef struct {
 static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
 
 #define WARP_SIZE 32
-#define MATRIX_ROW_PADDING
+#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
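The new MATRIX_ROW_PADDING constant means the last row of a quantized matrix is padded up to a multiple of 512 elements, so vectorized kernels may read slightly past the logical end of a row without going out of bounds. A minimal host-side sketch of the same round-up used later in this diff (the helper name is illustrative, not part of ggml):

```cpp
#include <cassert>
#include <cstdint>

// Round a row length up to the next multiple of `padding` (512 here),
// mirroring the expression used in ggml_cuda_op_mul_mat_vec below.
static int64_t pad_row(int64_t ne00, int64_t padding) {
    return ne00 % padding == 0 ? ne00 : ne00 - ne00 % padding + padding;
}

int main() {
    assert(pad_row(4096, 512) == 4096); // already aligned, unchanged
    assert(pad_row(4097, 512) == 4608); // rounded up to the next multiple of 512
    return 0;
}
```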
@@ -332,12 +332,10 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     }
 }
 
-static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
+static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
-    const float eps = 1e-6f;
-
     float tmp = 0.0f; // partial sum for thread in warp
 
     for (int col = tid; col < ncols; col += WARP_SIZE) {
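rms_norm_f32 now receives eps as a kernel argument instead of hard-coding 1e-6f; the host reads the value from the tensor's op_params further down in this diff. A plain C++ reference of the same arithmetic for a single row (no warp reduction), only to make the role of eps explicit; this is an illustrative sketch, not the kernel:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Reference RMS norm for one row: x * 1/sqrt(mean(x^2) + eps).
static void rms_norm_ref(const float * x, float * dst, int ncols, float eps) {
    float sum = 0.0f;
    for (int i = 0; i < ncols; ++i) {
        sum += x[i] * x[i];
    }
    const float scale = 1.0f / std::sqrt(sum / ncols + eps);
    for (int i = 0; i < ncols; ++i) {
        dst[i] = x[i] * scale;
    }
}

int main() {
    std::vector<float> x = {1.0f, -2.0f, 3.0f, -4.0f};
    std::vector<float> y(x.size());
    rms_norm_ref(x.data(), y.data(), (int) x.size(), 1e-6f);
    printf("%f %f %f %f\n", y[0], y[1], y[2], y[3]);
    return 0;
}
```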
@@ -935,12 +933,18 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     uint16_t aux[4];
     const uint8_t * sc = (const uint8_t *)aux;
 
+#if K_QUANTS_PER_ITERATION == 2
+    uint32_t q32[4];
+    const uint8_t * q4 = (const uint8_t *)q32;
+#else
+    uint16_t q16[4];
+    const uint8_t * q4 = (const uint8_t *)q16;
+#endif
+
     float tmp = 0; // partial sum for thread in warp
 
     for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
 
-        const uint8_t * q1 = x[i].qs + q_offset;
-        const uint8_t * q2 = q1 + 64;
         const float * y1 = yy + i*QK_K + y_offset;
         const float * y2 = y1 + 128;
 
@@ -953,14 +957,41 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
         aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
         aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
 
+#if K_QUANTS_PER_ITERATION == 2
+        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+        const uint32_t * q2 = q1 + 16;
+
+        q32[0] = q1[0] & 0x0f0f0f0f;
+        q32[1] = q1[0] & 0xf0f0f0f0;
+        q32[2] = q2[0] & 0x0f0f0f0f;
+        q32[3] = q2[0] & 0xf0f0f0f0;
+
         float4 s = {0.f, 0.f, 0.f, 0.f};
         float smin = 0;
-        for (int l = 0; l <
-            s.x += y1[l] *
-            s.z += y2[l] *
+        for (int l = 0; l < 4; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
+            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
             smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
         }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#else
+        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+        const uint16_t * q2 = q1 + 32;
+
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[0] & 0xf0f0;
+        q16[2] = q2[0] & 0x0f0f;
+        q16[3] = q2[0] & 0xf0f0;
+
+        float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 2; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#endif
 
     }
 #else
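The rewritten q4_K path loads four quantized bytes at once as a uint32_t and splits low/high nibbles with the 0x0f0f0f0f / 0xf0f0f0f0 masks; since the high nibbles stay in place instead of being shifted down, the partial sums that use them are rescaled by 1/16 when the block scale is applied. A host-side sketch of that trick, assuming little-endian byte order as on the GPU:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    // Four packed q4 bytes, each holding two 4-bit quants.
    const uint8_t qs[4] = {0x21, 0x43, 0x65, 0x87};

    uint32_t q;
    std::memcpy(&q, qs, sizeof(q));        // one 32-bit load instead of four byte loads

    const uint32_t lo = q & 0x0f0f0f0f;    // low nibbles: 1, 3, 5, 7
    const uint32_t hi = q & 0xf0f0f0f0;    // high nibbles left in place: 2*16, 4*16, 6*16, 8*16

    const uint8_t * plo = (const uint8_t *) &lo;
    const uint8_t * phi = (const uint8_t *) &hi;
    for (int i = 0; i < 4; ++i) {
        // dividing by 16 recovers the high-nibble value, which is why the kernel
        // multiplies the corresponding scale by 1.f/16.f instead of shifting
        printf("lo=%d hi=%d hi/16=%d\n", plo[i], phi[i], phi[i] / 16);
    }
    return 0;
}
```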
@@ -1040,10 +1071,12 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
     uint16_t aux[4];
     const uint8_t * sc = (const uint8_t *)aux;
 
+    uint16_t q16[8];
+    const uint8_t * q4 = (const uint8_t *)q16;
+
     for (int i = ix; i < num_blocks_per_row; i += 2) {
 
         const uint8_t * ql1 = x[i].qs + q_offset;
-        const uint8_t * ql2 = ql1 + 64;
         const uint8_t * qh = x[i].qh + l0;
         const float * y1 = yy + i*QK_K + y_offset;
         const float * y2 = y1 + 128;
@@ -1059,15 +1092,25 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
 
         float4 sum = {0.f, 0.f, 0.f, 0.f};
         float smin = 0;
+        const uint16_t * q1 = (const uint16_t *)ql1;
+        const uint16_t * q2 = q1 + 32;
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[8] & 0x0f0f;
+        q16[2] = (q1[0] >> 4) & 0x0f0f;
+        q16[3] = (q1[8] >> 4) & 0x0f0f;
+        q16[4] = q2[0] & 0x0f0f;
+        q16[5] = q2[8] & 0x0f0f;
+        q16[6] = (q2[0] >> 4) & 0x0f0f;
+        q16[7] = (q2[8] >> 4) & 0x0f0f;
         for (int l = 0; l < n; ++l) {
-            sum.x += y1[l+ 0] * (
-                   + y1[l+16] * (
-            sum.y += y1[l+32] * (
-                   + y1[l+48] * (
-            sum.z += y2[l+ 0] * (
-                   + y2[l+16] * (
-            sum.w += y2[l+32] * (
-                   + y2[l+48] * (
+            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
+            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
+            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
+            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
             smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
                   + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
         }
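q5_K keeps 4 low bits per weight in qs and the 5th bit in a separate qh array; the kernel above tests qh against a moving bit mask (hm1/hm2, shifted per sub-block) and adds 16 when the bit is set. A small sketch of rebuilding one 5-bit value from the two arrays (names are illustrative):

```cpp
#include <cstdint>
#include <cstdio>

// Rebuild one 5-bit quant from its low nibble and its high bit.
static int q5_value(uint8_t low_nibble, uint8_t qh_byte, uint8_t high_bit_mask) {
    return low_nibble + ((qh_byte & high_bit_mask) ? 16 : 0);
}

int main() {
    // low nibble 7, high bit set at bit position 2 -> 7 + 16 = 23
    printf("%d\n", q5_value(7, 0x04, 1u << 2));
    // same nibble, high bit clear -> stays 7
    printf("%d\n", q5_value(7, 0x00, 1u << 2));
    return 0;
}
```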
@@ -1521,33 +1564,95 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;
 
-    const int bq8_offset = QR4_K * (iqs / QI8_1);
-
     float sumf_d = 0.0f;
     float sumf_m = 0.0f;
 
+#ifndef GGML_QKK_64
+
+    // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
+    const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
+
     const float d = bq4_K->d;
     const float dmin = bq4_K->dmin;
 
-
+    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
+    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
+    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
+    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
+
+    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+    const int v1 = q4[0];
+    const int v2 = q4[4];
+
+    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m = sc + 2;
 
     for (int i = 0; i < QR4_K; ++i) {
-        const int isc = bq8_offset + i;
-
-        uint8_t sc, m;
-        get_scale_min_k4(isc, bq4_K->scales, sc, m);
 
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
         const float d8i = bq8i->d;
+        const int * q8 = (const int *)bq8i->qs + (iqs%4);
+        const int ui1 = q8[0];
+        const int ui2 = q8[4];
+
+        const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
+        const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;
 
-        const int
+        const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
 
-        sumf_d += d8i * (
-        sumf_m += d8i * (
+        sumf_d += d8i * (dot1 * sc[i]);
+        sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
     }
 
     return d*sumf_d - dmin*sumf_m;
+
+#else
+
+    uint16_t aux16[2];
+    const uint8_t * s = (const uint8_t *)aux16;
+
+    const uint16_t * a = (const uint16_t *)bq4_K->scales;
+    aux16[0] = a[0] & 0x0f0f;
+    aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+    const float dall = bq4_K->d[0];
+    const float dmin = bq4_K->d[1];
+
+    const float d8_1 = bq8_1[0].d;
+    const float d8_2 = bq8_1[1].d;
+
+    const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+    const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+    const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+    const int * q4 = (const int *)bq4_K->qs + iqs;
+    const int v1 = q4[0];
+    const int v2 = q4[4];
+
+    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
+    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
+    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
+
+    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
+    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
+
+    return dall * sumf_d - dmin * sumf_m;
+
+#endif
+
 #else
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
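The reworked q4_K dot product leans on __dp4a, which multiplies four packed 8-bit lanes from each 32-bit operand and accumulates the results into an integer, so one instruction covers four quant/activation pairs; the dot2 term against the 0x01010101 constant is simply the sum of the four q8 bytes, used for the subtracted minimum. A host-side emulation of the intrinsic for illustration only (on the device the real __dp4a is used; lanes are treated as signed here, matching int arguments):

```cpp
#include <cstdint>
#include <cstdio>

// Emulates __dp4a(a, b, c) for signed 8-bit lanes: c + sum_i a_i * b_i.
static int dp4a_emu(int a, int b, int c) {
    for (int i = 0; i < 4; ++i) {
        const int8_t ai = (int8_t) (a >> (8 * i));
        const int8_t bi = (int8_t) (b >> (8 * i));
        c += ai * bi;
    }
    return c;
}

int main() {
    // bytes 1,2,3,4 and 5,6,7,8 packed in little-endian lane order
    const int a = 0x04030201;
    const int b = 0x08070605;
    printf("%d\n", dp4a_emu(a, b, 0));          // 1*5 + 2*6 + 3*7 + 4*8 = 70
    printf("%d\n", dp4a_emu(0x01010101, b, 0)); // sum of b's bytes = 26
    return 0;
}
```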
@@ -1559,7 +1664,11 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_K * bq5_K = (const block_q5_K *) vbq;
 
-
+#ifndef GGML_QKK_64
+
+    const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
+    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+    const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
 
     float sumf_d = 0.0f;
     float sumf_m = 0.0f;
@@ -1567,31 +1676,87 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     const float d = bq5_K->d;
     const float dmin = bq5_K->dmin;
 
-    const int
+    const int vl1 = ql[0];
+    const int vl2 = ql[4];
 
-    const int
+    const int vh1 = qh[0] >> bq8_offset;
+    const int vh2 = qh[4] >> bq8_offset;
 
-
-
+    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m = sc + 2;
 
-
-        get_scale_min_k4(isc, bq5_K->scales, sc, m);
+    for (int i = 0; i < QR5_K; ++i) {
 
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
         const float d8i = bq8i->d;
+        const int * q8 = (const int *)bq8i->qs + (iqs%4);
+        const int ui1 = q8[0];
+        const int ui2 = q8[4];
 
-        const int
+        const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F;
+        const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih1 = ((vh1 >> i) << 4) & 0x10101010;
+        const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
+
+        const int vi1 = vil1 | vih1;
+        const int vi2 = vil2 | vih2;
 
-        const int
+        const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
 
-
+        sumf_d += d8i * (dot1 * sc[i]);
+        sumf_m += d8i * (dot2 * m[i]);
 
-        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
-        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
     }
 
     return d*sumf_d - dmin*sumf_m;
+
+#else
+
+    const int8_t * s = bq5_K->scales;
+
+    const float d = bq5_K->d;
+
+    const float d8_1 = bq8_1[0].d;
+    const float d8_2 = bq8_1[1].d;
+
+    const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+    const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+    const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+    const int * ql = (const int *)bq5_K->qs + iqs;
+    const int vl1 = ql[0];
+    const int vl2 = ql[4];
+
+    const int step = 4 * iqs; // 0, 4, 8, 12
+    const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
+    const int in = step%8; // 0, 4, 0, 4
+    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
+
+    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
+    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
+    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
+    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
+
+    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
+                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
+
+    return d * sumf_d;
+
+#endif
+
 #else
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
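Both K-quant dot products above drop the per-element get_scale_min_k4 helper and instead pull two packed 6-bit scale/min values at a time out of 16-bit words with the 0x3f3f, 0x0f0f and 0xc0c0 masks (q4_K/q5_K pack eight 6-bit scales and eight 6-bit mins into 12 bytes in the QK_K = 256 layout). A rough host-side illustration of that mask arithmetic; the byte values are made up and only the first-two-blocks (j < 2) case is shown:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    // Two packed scale bytes viewed as one little-endian 16-bit word:
    // each byte = (2 high bits reused by later blocks) | (6-bit scale).
    const uint8_t scales[2] = {0xAB, 0xC5};
    uint16_t w;
    std::memcpy(&w, scales, sizeof(w));

    const uint16_t two_scales = w & 0x3f3f;      // keep the low 6 bits of both bytes
    const uint8_t * sc = (const uint8_t *) &two_scales;
    printf("sc[0]=%d sc[1]=%d\n", sc[0], sc[1]); // 43 and 5

    // For the upper blocks the top two bits of each byte are reused:
    // ((w & 0xc0c0) >> 2) moves them to bit positions 4..5 of each byte,
    // where the kernel ORs them with a low nibble taken from a later word.
    const uint16_t high_bits = (w & 0xc0c0) >> 2;
    const uint8_t * hb = (const uint8_t *) &high_bits;
    printf("hb[0]=%d hb[1]=%d\n", hb[0], hb[1]);// 32 and 48
    return 0;
}
```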
@@ -1745,11 +1910,15 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
     }
 }
 
-static __global__ void mul_mat_p021_f16_f32(
+static __global__ void mul_mat_p021_f16_f32(
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
+
     const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+    const int channel_x = channel / (nchannels_y / nchannels_x);
 
     const int nrows_y = ncols_x;
     const int nrows_dst = nrows_x;
@@ -1765,7 +1934,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
     }
 
     // x is transposed and permuted
-    const int ix = row_x*nchannels_x*ncols_x +
+    const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
     const float xi = __half2float(x[ix]);
 
     const int row_y = col_x;
@@ -1793,12 +1962,13 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
 
 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int channel_stride_x) {
+    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
 
     const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+    const int channel_x = channel / channel_x_divisor;
 
     const int nrows_y = ncols_x;
     const int nrows_dst = nrows_x;
@@ -1815,7 +1985,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
             break;
         }
 
-        const int ix =
+        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
         const float xi = __half2float(x[ix]);
 
         const int row_y = col_x;
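Both f16 x f32 mat-vec kernels now take the number of src1 channels and derive channel_x = channel / (nchannels_y / nchannels_x), so a src0 with fewer channels can be broadcast across a src1 with more of them (the ne12 > ne02 case handled throughout this diff). A tiny sketch of that index mapping, assuming nchannels_y is a multiple of nchannels_x as the kernels do:

```cpp
#include <cstdio>

int main() {
    const int nchannels_x = 2;  // channels in the f16 matrix (src0)
    const int nchannels_y = 8;  // channels in the f32 input (src1)
    const int divisor = nchannels_y / nchannels_x; // 4 src1 channels per src0 channel

    for (int channel = 0; channel < nchannels_y; ++channel) {
        const int channel_x = channel / divisor;   // 0,0,0,0,1,1,1,1
        printf("y channel %d -> x channel %d\n", channel, channel_x);
    }
    return 0;
}
```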
@@ -2027,10 +2197,10 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
     norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
-static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     const dim3 block_dims(WARP_SIZE, 1, 1);
-    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
 }
 
 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
@@ -2259,7 +2429,10 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-
+    // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
+    // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
+    // is better amortized.
+    mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2268,7 +2441,10 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-
+    // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
+    // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
+    // is better amortized.
+    mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2324,20 +2500,23 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     }
 }
 
-static void ggml_mul_mat_p021_f16_f32_cuda(
-    const
+static void ggml_mul_mat_p021_f16_f32_cuda(
+    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+    const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
+
+    const dim3 block_nums(1, nrows_x, nchannels_y);
     const dim3 block_dims(WARP_SIZE, 1, 1);
-    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
+    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
 }
 
 static void ggml_mul_mat_vec_nc_f16_f32_cuda(
     const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
-    const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
+    const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
 
-    const dim3 block_nums(1, nrows_x,
+    const dim3 block_nums(1, nrows_x, nchannels_y);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
+        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
 }
 
 static void ggml_cpy_f32_f32_cuda(
@@ -2497,7 +2676,9 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
+#ifndef GGML_CUDA_FORCE_DMMV
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+#endif
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -2520,7 +2701,9 @@ void ggml_init_cublas() {
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 
+#ifndef GGML_CUDA_FORCE_DMMV
         g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+#endif
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
@@ -2688,6 +2871,7 @@ inline void ggml_cuda_op_mul(
     (void) dst;
     (void) src0_ddq_i;
     (void) i02;
+    (void) i1;
 }
 
 inline void ggml_cuda_op_gelu(
@@ -2767,8 +2951,11 @@ inline void ggml_cuda_op_rms_norm(
     const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
     // compute
-    rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+    rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
 
     (void) src1;
     (void) dst;
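On the host side the epsilon is no longer a literal: it travels in the tensor's op_params, a small scratch array of 32-bit words attached to the operation, and memcpy is used rather than a cast because a float is bit-copied into storage declared as int32_t. A minimal sketch of that round trip; the struct below is a stand-in for illustration, not ggml's real ggml_tensor:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-in for the op_params scratch area on a tensor (illustrative only).
struct fake_tensor {
    int32_t op_params[8];
};

int main() {
    fake_tensor dst = {};

    // producer side: store the float's bit pattern into the int32 array
    const float eps_in = 1e-5f;
    std::memcpy(dst.op_params, &eps_in, sizeof(float));

    // consumer side (as in ggml_cuda_op_rms_norm): read it back the same way
    float eps_out;
    std::memcpy(&eps_out, dst.op_params, sizeof(float));

    printf("eps = %g\n", eps_out); // 1e-05
    return 0;
}
```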
@@ -2815,8 +3002,8 @@ inline void ggml_cuda_op_mul_mat_vec(
 #endif
 
     if (use_mul_mat_vec_q) {
-        int64_t padded_row_size = ne00
-
+        const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
+            ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
         size_t as;
         void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
         quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
@@ -2983,15 +3170,15 @@ inline void ggml_cuda_op_rope(
     const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
-    const int n_past = ((int32_t *)
-    const int n_dims = ((int32_t *)
-    const int mode = ((int32_t *)
-    const int n_ctx = ((int32_t *)
-
+    const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_dims = ((int32_t *) dst->op_params)[1];
+    const int mode = ((int32_t *) dst->op_params)[2];
+    const int n_ctx = ((int32_t *) dst->op_params)[3];
     // RoPE alteration for extended context
+
     float freq_base, freq_scale;
-    memcpy(&freq_base, (int32_t *)
-    memcpy(&freq_scale, (int32_t *)
+    memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
     const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
@@ -3007,6 +3194,7 @@ inline void ggml_cuda_op_rope(
         rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
     }
 
+    (void) src1;
     (void) dst;
     (void) src0_ddq_i;
     (void) src1_ddf_i;
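The RoPE operator follows the same op_params pattern: four integers (n_past, n_dims, mode, n_ctx) sit in the first slots and two floats (freq_base, freq_scale) are memcpy'd out of slots 4 and 5, after which theta_scale = powf(freq_base, -2/n_dims) gives the per-pair frequency decay. A short sketch of how those two parameters shape the rotation angles; this is reference math only, not the CUDA kernel:

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const int   n_dims     = 8;        // rotated head dimension (values are rotated in pairs)
    const float freq_base  = 10000.0f; // default RoPE base
    const float freq_scale = 1.0f;     // linear position scaling for extended context
    const int   pos        = 3;        // token position (n_past + i)

    const float theta_scale = std::pow(freq_base, -2.0f / n_dims);

    // theta for pair k is (pos * freq_scale) * theta_scale^k
    float theta = pos * freq_scale;
    for (int k = 0; k < n_dims / 2; ++k) {
        printf("pair %d: theta = %f (cos %f, sin %f)\n", k, theta, std::cos(theta), std::sin(theta));
        theta *= theta_scale;
    }
    return 0;
}
```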
@@ -3025,11 +3213,12 @@ inline void ggml_cuda_op_diag_mask_inf(
     const int64_t ne01 = src0->ne[1];
     const int64_t i01_diff = i01_high - i01_low;
 
-    const int n_past = ((int32_t *)
+    const int n_past = ((int32_t *) dst->op_params)[0];
 
     // compute
     diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
 
+    (void) src1;
     (void) dst;
     (void) src0_ddq_i;
     (void) src1_ddf_i;
@@ -3097,6 +3286,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
     const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
     const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
+    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+    GGML_ASSERT(ne03 == ne13);
 
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
@@ -3108,12 +3300,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
 
     // strides for iteration over dims 3 and 2
-    const int64_t
-    const int64_t
+    const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
+    const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
+    const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
     const int64_t src0_stride = ne00 * ne01 * stride_mod;
     const int64_t src1_stride = ne10 * ne11 * stride_mod;
     const int64_t dst_stride = ne0 * ne1 * stride_mod;
 
+    const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
+    const int64_t i03_max = flatten_rows ? 1 : ne03;
+    const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
+    const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
+    GGML_ASSERT(!(flatten_rows && ne02 < ne12));
+
     const size_t src0_ts = ggml_type_size(src0->type);
     const size_t src0_bs = ggml_blck_size(src0->type);
 
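ggml_cuda_op now iterates over max(ne02, ne12) slices and divides the src0 slice index by i02_divisor = ne12/ne02 when src1 has more slices, so a single src0 slice is reused for several src1 slices; this is the host-side counterpart of the kernel broadcast above. A compact sketch of the index arithmetic (names mirror the diff, but the loop is simplified):

```cpp
#include <cstdio>

int main() {
    const int ne02 = 2; // src0 slices along dim 2
    const int ne12 = 6; // src1 slices along dim 2 (a multiple of ne02)

    const int i02_max     = ne02 >= ne12 ? ne02 : ne12; // iterate over the larger count
    const int i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;

    for (int i02 = 0; i02 < i02_max; ++i02) {
        const int i12      = i02 % ne12;        // src1 slice for this iteration
        const int src0_i02 = i02 / i02_divisor; // src0 slice, reused i02_divisor times
        printf("iter %d: src0 slice %d, src1 slice %d\n", i02, src0_i02, i12);
    }
    return 0;
}
```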
@@ -3130,6 +3329,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
 
     const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+    GGML_ASSERT(!(split && ne02 < ne12));
 
     const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
 
@@ -3166,7 +3366,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
         } else {
             row_low = 0;
-            row_high = nrows0;
+            row_high = nrows0*i02_divisor;
         }
         if (row_low == row_high) {
             continue;
@@ -3214,16 +3414,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
         }
 
-        const int64_t i03_max = flatten_rows ? 1 : ne03;
-        const int64_t i02_max = flatten_rows ? 1 : ne02;
-        const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
-
         for (int64_t i03 = 0; i03 < i03_max; i03++) {
             const int64_t i13 = i03 % ne13;
             for (int64_t i02 = 0; i02 < i02_max; i02++) {
                 const int64_t i12 = i02 % ne12;
 
-                const int64_t i0 = i03*
+                const int64_t i0 = i03*i02_max + i02;
 
                 // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
                 const int64_t i0_offset_low = row_low/rows_per_iter;
@@ -3257,10 +3453,10 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 const int64_t i11 = i13*ne12 + i12;
 
                 // for split tensors the data begins at i0 == i0_offset_low
-                char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
-                float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
+                char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
+                float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
                 float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
-                float * dst_ddf_i = dst_ddf[id] + (i0
+                float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
 
                 // for split tensors the data pointer needs to be rounded down
                 // to the bin edge for i03, i02 bins beyond the first
@@ -3299,11 +3495,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                     }
                 }
 
-                if (!src0_on_device || !src0_is_contiguous) {
+                if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
                     if (src0_is_f32) {
-                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
                     } else {
-                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
                     }
                 }
 
@@ -3457,6 +3653,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
 
+    const int64_t ne12 = src1->ne[2];
+
     CUDA_CHECK(cudaSetDevice(g_main_device));
     cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
@@ -3469,7 +3667,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
-    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
+    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
 }
 
 void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -3483,6 +3681,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
 
+    const int64_t ne12 = src1->ne[2];
+
     const int64_t nb01 = src0->nb[1];
     const int64_t nb02 = src0->nb[2];
 
@@ -3501,7 +3701,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int row_stride_x = nb01 / sizeof(half);
     const int channel_stride_x = nb02 / sizeof(half);
 
-    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
+    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
 }
 
 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3642,7 +3842,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         size_t size = ggml_nbytes_split(tensor, nrows_split);
         const size_t original_size = size;
 
-        // pad last row to a multiple of
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
         if (ne0 % MATRIX_ROW_PADDING != 0) {
             size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
                 * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
@@ -3658,7 +3858,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         }
 
 
-        CUDA_CHECK(cudaMemcpy(buf, buf_host,
+        CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
 
         extra->data_device[id] = buf;
 
@@ -3738,7 +3938,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
-            memcpy(&offset, tensor->
+            memcpy(&offset, tensor->op_params, sizeof(size_t));
        }
         extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src0_ddc + offset;
@@ -3840,18 +4040,23 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_mul;
             break;
-        case
-
-
-
-
-
-
-
-
-
-
-
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(tensor)) {
+                case GGML_UNARY_OP_GELU:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cuda_gelu;
+                    break;
+                case GGML_UNARY_OP_SILU:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cuda_silu;
+                    break;
+                default:
+                    return false;
+            } break;
         case GGML_OP_NORM:
             if (!any_on_device) {
                 return false;
|