llama_cpp 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +293 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +304 -99
- data/ext/llama_cpp/src/ggml-metal.h +7 -0
- data/ext/llama_cpp/src/ggml-metal.m +201 -71
- data/ext/llama_cpp/src/ggml-metal.metal +68 -54
- data/ext/llama_cpp/src/ggml.c +713 -978
- data/ext/llama_cpp/src/ggml.h +82 -17
- data/ext/llama_cpp/src/k_quants.c +327 -3
- data/ext/llama_cpp/src/llama.cpp +524 -121
- data/ext/llama_cpp/src/llama.h +60 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +24 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -220,7 +220,7 @@ typedef struct {
 static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");

 #define WARP_SIZE 32
-#define MATRIX_ROW_PADDING
+#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
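MATRIX_ROW_PADDING now carries an explicit value: quantized matrix rows are padded up to a multiple of 512 elements so that the vectorized q8_1 kernels never read past the end of the real data (the same round-up reappears in ggml_cuda_op_mul_mat_vec and ggml_cuda_transform_tensor below). A minimal host-side sketch of the round-up arithmetic; the helper name pad_row_size is ours, not part of the diff:

```cpp
#include <cassert>
#include <cstdint>

// Round a row length up to the next multiple of `padding` (512 in the diff),
// so loads slightly past the end of the real data stay in-bounds.
constexpr int64_t pad_row_size(int64_t ne00, int64_t padding = 512) {
    return ne00 % padding == 0 ? ne00 : ne00 - ne00 % padding + padding;
}

int main() {
    static_assert(pad_row_size(512)  == 512,  "aligned rows are unchanged");
    static_assert(pad_row_size(4097) == 4608, "partial rows grow to the next 512 boundary");
    assert(pad_row_size(11008) == 11264); // e.g. an 11008-wide row is padded up to 11264
    return 0;
}
```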
@@ -332,12 +332,10 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
 }
 }

-static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
+static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
 const int row = blockIdx.x*blockDim.y + threadIdx.y;
 const int tid = threadIdx.x;

-const float eps = 1e-6f;
-
 float tmp = 0.0f; // partial sum for thread in warp

 for (int col = tid; col < ncols; col += WARP_SIZE) {
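rms_norm_f32 now receives eps as a kernel argument instead of hard-coding 1e-6f, so the epsilon stored in the tensor's op_params (read in ggml_cuda_op_rms_norm further down) actually reaches the GPU. A plain C++ reference of the per-row computation, assuming the usual RMS-norm formula; names are illustrative:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Reference RMS norm for one row: x_i -> x_i / sqrt(mean(x^2) + eps).
// The CUDA kernel computes the same thing with a warp-level reduction.
void rms_norm_row(const float * x, float * dst, int ncols, float eps) {
    float sum_sq = 0.0f;
    for (int col = 0; col < ncols; ++col) {
        sum_sq += x[col] * x[col];
    }
    const float scale = 1.0f / std::sqrt(sum_sq / ncols + eps);
    for (int col = 0; col < ncols; ++col) {
        dst[col] = x[col] * scale;
    }
}

int main() {
    std::vector<float> x = {1.0f, -2.0f, 3.0f, -4.0f};
    std::vector<float> y(x.size());
    rms_norm_row(x.data(), y.data(), (int) x.size(), 1e-6f); // eps now comes from op_params
    for (float v : y) std::printf("%.4f ", v);
    std::printf("\n");
    return 0;
}
```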
@@ -935,12 +933,18 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
 uint16_t aux[4];
 const uint8_t * sc = (const uint8_t *)aux;

+#if K_QUANTS_PER_ITERATION == 2
+uint32_t q32[4];
+const uint8_t * q4 = (const uint8_t *)q32;
+#else
+uint16_t q16[4];
+const uint8_t * q4 = (const uint8_t *)q16;
+#endif
+
 float tmp = 0; // partial sum for thread in warp

 for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

-const uint8_t * q1 = x[i].qs + q_offset;
-const uint8_t * q2 = q1 + 64;
 const float * y1 = yy + i*QK_K + y_offset;
 const float * y2 = y1 + 128;

@@ -953,14 +957,41 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
 aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
 aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);

+#if K_QUANTS_PER_ITERATION == 2
+const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+const uint32_t * q2 = q1 + 16;
+
+q32[0] = q1[0] & 0x0f0f0f0f;
+q32[1] = q1[0] & 0xf0f0f0f0;
+q32[2] = q2[0] & 0x0f0f0f0f;
+q32[3] = q2[0] & 0xf0f0f0f0;
+
 float4 s = {0.f, 0.f, 0.f, 0.f};
 float smin = 0;
-for (int l = 0; l <
-s.x += y1[l] *
-s.z += y2[l] *
+for (int l = 0; l < 4; ++l) {
+s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
+s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
 smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
 }
-tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
+tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#else
+const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+const uint16_t * q2 = q1 + 32;
+
+q16[0] = q1[0] & 0x0f0f;
+q16[1] = q1[0] & 0xf0f0;
+q16[2] = q2[0] & 0x0f0f;
+q16[3] = q2[0] & 0xf0f0;
+
+float4 s = {0.f, 0.f, 0.f, 0.f};
+float smin = 0;
+for (int l = 0; l < 2; ++l) {
+s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+}
+tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#endif

 }
 #else
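The reworked q4_K path loads four quantized bytes at once into a 32-bit word and splits them with the 0x0f0f0f0f / 0xf0f0f0f0 masks; the high nibbles stay in place (still scaled by 16), which is why their partial sums are later multiplied by 1.f/16.f in the tmp accumulation. A small host-side sketch of that masking trick; helper names are ours:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
    // Four packed q4 bytes: each byte holds a low nibble and a high nibble.
    const uint8_t packed[4] = {0x21, 0x43, 0x65, 0x87};
    uint32_t q;
    std::memcpy(&q, packed, sizeof(q));

    uint32_t q32[2];
    q32[0] = q & 0x0f0f0f0f; // low nibbles, already in the value range 0..15
    q32[1] = q & 0xf0f0f0f0; // high nibbles, still scaled by 16

    const uint8_t * lo = (const uint8_t *) &q32[0];
    const uint8_t * hi = (const uint8_t *) &q32[1];

    for (int l = 0; l < 4; ++l) {
        assert(lo[l] == (packed[l] & 0x0f));
        assert(hi[l] == (packed[l] & 0xf0));    // == 16 * (packed[l] >> 4)
        assert(hi[l] / 16 == (packed[l] >> 4)); // hence the * 1.f/16.f on those scales
    }
    return 0;
}
```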
@@ -1040,10 +1071,12 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
 uint16_t aux[4];
 const uint8_t * sc = (const uint8_t *)aux;

+uint16_t q16[8];
+const uint8_t * q4 = (const uint8_t *)q16;
+
 for (int i = ix; i < num_blocks_per_row; i += 2) {

 const uint8_t * ql1 = x[i].qs + q_offset;
-const uint8_t * ql2 = ql1 + 64;
 const uint8_t * qh = x[i].qh + l0;
 const float * y1 = yy + i*QK_K + y_offset;
 const float * y2 = y1 + 128;
@@ -1059,15 +1092,25 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,

 float4 sum = {0.f, 0.f, 0.f, 0.f};
 float smin = 0;
+const uint16_t * q1 = (const uint16_t *)ql1;
+const uint16_t * q2 = q1 + 32;
+q16[0] = q1[0] & 0x0f0f;
+q16[1] = q1[8] & 0x0f0f;
+q16[2] = (q1[0] >> 4) & 0x0f0f;
+q16[3] = (q1[8] >> 4) & 0x0f0f;
+q16[4] = q2[0] & 0x0f0f;
+q16[5] = q2[8] & 0x0f0f;
+q16[6] = (q2[0] >> 4) & 0x0f0f;
+q16[7] = (q2[8] >> 4) & 0x0f0f;
 for (int l = 0; l < n; ++l) {
-sum.x += y1[l+ 0] * (
-+ y1[l+16] * (
-sum.y += y1[l+32] * (
-+ y1[l+48] * (
-sum.z += y2[l+ 0] * (
-+ y2[l+16] * (
-sum.w += y2[l+32] * (
-+ y2[l+48] * (
+sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
++ y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
+sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
++ y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
+sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
++ y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
+sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
++ y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
 smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
 + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
 }
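For q5_K each weight is rebuilt as a 4-bit low part (taken from the repacked q16 scratch array) plus 16 whenever the matching bit of the separate qh array is set. A scalar sketch of that reconstruction under the same convention; variable names are illustrative:

```cpp
#include <cassert>
#include <cstdint>

// Rebuild one 5-bit q5_K value from its packed parts:
// ql holds two 4-bit values per byte, qh holds the fifth bit for several values.
static int dequant_q5(uint8_t ql_byte, bool use_high_nibble, uint8_t qh_byte, uint8_t hm) {
    const int low  = use_high_nibble ? (ql_byte >> 4) : (ql_byte & 0x0f);
    const int high = (qh_byte & hm) ? 16 : 0; // hm is a single-bit mask, e.g. hm1 << 1 in the kernel
    return low + high;                        // resulting value is in 0..31
}

int main() {
    // Low nibble 0x9, high bit set -> 9 + 16 = 25.
    assert(dequant_q5(0xA9, /*use_high_nibble=*/false, /*qh_byte=*/0x02, /*hm=*/0x02) == 25);
    // Same byte, high nibble selected, high bit clear -> 10.
    assert(dequant_q5(0xA9, /*use_high_nibble=*/true,  /*qh_byte=*/0x00, /*hm=*/0x02) == 10);
    return 0;
}
```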
@@ -1521,33 +1564,95 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 const block_q4_K * bq4_K = (const block_q4_K *) vbq;

-const int bq8_offset = QR4_K * (iqs / QI8_1);
-
 float sumf_d = 0.0f;
 float sumf_m = 0.0f;

+#ifndef GGML_QKK_64
+
+// iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
+const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
+
 const float d = bq4_K->d;
 const float dmin = bq4_K->dmin;

-
+// iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
+// iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
+// iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
+// iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
+
+const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+const int v1 = q4[0];
+const int v2 = q4[4];
+
+const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+uint16_t aux[2];
+const int j = bq8_offset/2;
+if (j < 2) {
+aux[0] = scales[j+0] & 0x3f3f;
+aux[1] = scales[j+2] & 0x3f3f;
+} else {
+aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+}
+const uint8_t * sc = (const uint8_t *)aux;
+const uint8_t * m = sc + 2;

 for (int i = 0; i < QR4_K; ++i) {
-const int isc = bq8_offset + i;
-
-uint8_t sc, m;
-get_scale_min_k4(isc, bq4_K->scales, sc, m);

 const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
 const float d8i = bq8i->d;
+const int * q8 = (const int *)bq8i->qs + (iqs%4);
+const int ui1 = q8[0];
+const int ui2 = q8[4];
+
+const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
+const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;

-const int
+const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));

-sumf_d += d8i * (
-sumf_m += d8i * (
+sumf_d += d8i * (dot1 * sc[i]);
+sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
 }

 return d*sumf_d - dmin*sumf_m;
+
+#else
+
+uint16_t aux16[2];
+const uint8_t * s = (const uint8_t *)aux16;
+
+const uint16_t * a = (const uint16_t *)bq4_K->scales;
+aux16[0] = a[0] & 0x0f0f;
+aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+const float dall = bq4_K->d[0];
+const float dmin = bq4_K->d[1];
+
+const float d8_1 = bq8_1[0].d;
+const float d8_2 = bq8_1[1].d;
+
+const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+const int * q4 = (const int *)bq4_K->qs + iqs;
+const int v1 = q4[0];
+const int v2 = q4[4];
+
+const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
+const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
+const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
+
+sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
+sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
+
+return dall * sumf_d - dmin * sumf_m;
+
+#endif
+
 #else
 return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
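vec_dot_q4_K_q8_1 now packs four 8-bit values per 32-bit operand and accumulates them with the __dp4a intrinsic; the 0x01010101 operand is used to sum the four lanes of the q8_1 values for the constant (dmin) part. A portable scalar stand-in for __dp4a, for readers without a DP4A-capable GPU; dp4a_ref is our name, not a CUDA API:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// Scalar reference for CUDA's __dp4a(a, b, c):
// treats a and b as four signed 8-bit lanes and returns c + sum(a_i * b_i).
static int dp4a_ref(int a, int b, int c) {
    int8_t va[4], vb[4];
    std::memcpy(va, &a, 4);
    std::memcpy(vb, &b, 4);
    for (int i = 0; i < 4; ++i) {
        c += (int) va[i] * (int) vb[i];
    }
    return c;
}

int main() {
    // __dp4a(0x01010101, x, 0) sums the four lanes of x, as used for the dmin term.
    const int x = 0x04030201; // lanes 1, 2, 3, 4
    assert(dp4a_ref(0x01010101, x, 0) == 1 + 2 + 3 + 4);

    // Chained form, as in dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)).
    const int vi = 0x02020202, ui = 0x01020304;
    assert(dp4a_ref(vi, ui, dp4a_ref(vi, ui, 0)) == 2 * (2 * (1 + 2 + 3 + 4)));
    return 0;
}
```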
@@ -1559,7 +1664,11 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 const block_q5_K * bq5_K = (const block_q5_K *) vbq;

-
+#ifndef GGML_QKK_64
+
+const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
+const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));

 float sumf_d = 0.0f;
 float sumf_m = 0.0f;
@@ -1567,31 +1676,87 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 const float d = bq5_K->d;
 const float dmin = bq5_K->dmin;

-const int
+const int vl1 = ql[0];
+const int vl2 = ql[4];

-const int
+const int vh1 = qh[0] >> bq8_offset;
+const int vh2 = qh[4] >> bq8_offset;

-
-
+const uint16_t * scales = (const uint16_t *)bq5_K->scales;
+uint16_t aux[2];
+const int j = bq8_offset/2;
+if (j < 2) {
+aux[0] = scales[j+0] & 0x3f3f;
+aux[1] = scales[j+2] & 0x3f3f;
+} else {
+aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+}
+const uint8_t * sc = (const uint8_t *)aux;
+const uint8_t * m = sc + 2;

-
-get_scale_min_k4(isc, bq5_K->scales, sc, m);
+for (int i = 0; i < QR5_K; ++i) {

 const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
 const float d8i = bq8i->d;
+const int * q8 = (const int *)bq8i->qs + (iqs%4);
+const int ui1 = q8[0];
+const int ui2 = q8[4];

-const int
+const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F;
+const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
+
+const int vih1 = ((vh1 >> i) << 4) & 0x10101010;
+const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
+
+const int vi1 = vil1 | vih1;
+const int vi2 = vil2 | vih2;

-const int
+const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));

-
+sumf_d += d8i * (dot1 * sc[i]);
+sumf_m += d8i * (dot2 * m[i]);

-sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
-sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
 }

 return d*sumf_d - dmin*sumf_m;
+
+#else
+
+const int8_t * s = bq5_K->scales;
+
+const float d = bq5_K->d;
+
+const float d8_1 = bq8_1[0].d;
+const float d8_2 = bq8_1[1].d;
+
+const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+const int * ql = (const int *)bq5_K->qs + iqs;
+const int vl1 = ql[0];
+const int vl2 = ql[4];
+
+const int step = 4 * iqs; // 0, 4, 8, 12
+const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
+const int in = step%8; // 0, 4, 0, 4
+const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
+
+const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
+const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
+const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
+const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
+
+const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
++ d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
+
+return d * sumf_d;
+
+#endif
+
 #else
 return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -1745,11 +1910,15 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
 }
 }

-static __global__ void mul_mat_p021_f16_f32(
+static __global__ void mul_mat_p021_f16_f32(
+const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
+const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
+
 const half * x = (const half *) vx;

 const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
 const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+const int channel_x = channel / (nchannels_y / nchannels_x);

 const int nrows_y = ncols_x;
 const int nrows_dst = nrows_x;
@@ -1765,7 +1934,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
 }

 // x is transposed and permuted
-const int ix = row_x*nchannels_x*ncols_x +
+const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
 const float xi = __half2float(x[ix]);

 const int row_y = col_x;
@@ -1793,12 +1962,13 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const

 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
 const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
-const int row_stride_x, const int channel_stride_x) {
+const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {

 const half * x = (const half *) vx;

 const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
 const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+const int channel_x = channel / channel_x_divisor;

 const int nrows_y = ncols_x;
 const int nrows_dst = nrows_x;
@@ -1815,7 +1985,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
 break;
 }

-const int ix =
+const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
 const float xi = __half2float(x[ix]);

 const int row_y = col_x;
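Both matrix-vector kernels now derive the src0 channel from the output channel by dividing by a broadcast ratio (nchannels_y / nchannels_x, passed as channel_x_divisor in the non-contiguous variant), so a src0 tensor with fewer channels can be broadcast across all channels of src1. A tiny sketch of that index mapping; names are illustrative:

```cpp
#include <cassert>

// Map an output channel to the src0 channel it reads from when src0 has fewer
// channels than src1 and is broadcast (nchannels_y is a multiple of nchannels_x).
static int channel_of_src0(int channel, int nchannels_x, int nchannels_y) {
    const int channel_x_divisor = nchannels_y / nchannels_x;
    return channel / channel_x_divisor;
}

int main() {
    // 32 "y" channels reading from 8 "x" channels: every 4 consecutive output
    // channels share one source channel.
    assert(channel_of_src0(0,  8, 32) == 0);
    assert(channel_of_src0(3,  8, 32) == 0);
    assert(channel_of_src0(4,  8, 32) == 1);
    assert(channel_of_src0(31, 8, 32) == 7);
    return 0;
}
```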
@@ -2027,10 +2197,10 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
 norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }

-static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
 GGML_ASSERT(ncols % WARP_SIZE == 0);
 const dim3 block_dims(WARP_SIZE, 1, 1);
-rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
 }

 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
@@ -2259,7 +2429,10 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
 const dim3 block_nums(1, block_num_y, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-
+// Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
+// kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
+// is better amortized.
+mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

@@ -2268,7 +2441,10 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
 const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
 const dim3 block_nums(1, block_num_y, 1);
 const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-
+// Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
+// kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
+// is better amortized.
+mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

@@ -2324,20 +2500,23 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
 }
 }

-static void ggml_mul_mat_p021_f16_f32_cuda(
-const
+static void ggml_mul_mat_p021_f16_f32_cuda(
+const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
+
+const dim3 block_nums(1, nrows_x, nchannels_y);
 const dim3 block_dims(WARP_SIZE, 1, 1);
-mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
+mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
 }

 static void ggml_mul_mat_vec_nc_f16_f32_cuda(
 const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
-const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
+const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {

-const dim3 block_nums(1, nrows_x,
+const dim3 block_nums(1, nrows_x, nchannels_y);
 const dim3 block_dims(WARP_SIZE, 1, 1);
 mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
+(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
 }

 static void ggml_cpy_f32_f32_cuda(
@@ -2497,7 +2676,9 @@ static size_t g_scratch_offset = 0;

 static int g_device_count = -1;
 static int g_main_device = 0;
+#ifndef GGML_CUDA_FORCE_DMMV
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+#endif
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};

 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -2520,7 +2701,9 @@ void ggml_init_cublas() {
 g_tensor_split[id] = total_vram;
 total_vram += prop.totalGlobalMem;

+#ifndef GGML_CUDA_FORCE_DMMV
 g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+#endif
 }
 for (int id = 0; id < g_device_count; ++id) {
 g_tensor_split[id] /= total_vram;
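g_compute_capabilities is now compiled only when GGML_CUDA_FORCE_DMMV is not defined, and it keeps storing each device's capability as 100*prop.major + 10*prop.minor. A quick illustration of that encoding; the struct below is a stand-in for cudaDeviceProp, and 610 is the MIN_CC_DP4A threshold used upstream:

```cpp
#include <cassert>

struct DeviceProps { int major, minor; }; // stand-in for cudaDeviceProp

static int encode_cc(const DeviceProps & prop) {
    return 100 * prop.major + 10 * prop.minor;
}

int main() {
    assert(encode_cc({6, 1}) == 610); // e.g. Pascal GTX 10xx (compute capability 6.1)
    assert(encode_cc({8, 6}) == 860); // e.g. Ampere RTX 30xx (compute capability 8.6)
    // Capability checks then become plain integer comparisons, e.g. encode_cc(prop) >= 610.
    return 0;
}
```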
@@ -2688,6 +2871,7 @@ inline void ggml_cuda_op_mul(
 (void) dst;
 (void) src0_ddq_i;
 (void) i02;
+(void) i1;
 }

 inline void ggml_cuda_op_gelu(
@@ -2767,8 +2951,11 @@ inline void ggml_cuda_op_rms_norm(
 const int64_t ne00 = src0->ne[0];
 const int64_t i01_diff = i01_high - i01_low;

+float eps;
+memcpy(&eps, dst->op_params, sizeof(float));
+
 // compute
-rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);

 (void) src1;
 (void) dst;
@@ -2815,8 +3002,8 @@ inline void ggml_cuda_op_mul_mat_vec(
 #endif

 if (use_mul_mat_vec_q) {
-int64_t padded_row_size = ne00
-
+const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
+ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
 size_t as;
 void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
 quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
@@ -2983,15 +3170,15 @@ inline void ggml_cuda_op_rope(
 const int64_t ne00 = src0->ne[0];
 const int64_t i01_diff = i01_high - i01_low;

-const int n_past = ((int32_t *)
-const int n_dims = ((int32_t *)
-const int mode = ((int32_t *)
-const int n_ctx = ((int32_t *)
-
+const int n_past = ((int32_t *) dst->op_params)[0];
+const int n_dims = ((int32_t *) dst->op_params)[1];
+const int mode = ((int32_t *) dst->op_params)[2];
+const int n_ctx = ((int32_t *) dst->op_params)[3];
 // RoPE alteration for extended context
+
 float freq_base, freq_scale;
-memcpy(&freq_base, (int32_t *)
-memcpy(&freq_scale, (int32_t *)
+memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
+memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

 const float theta_scale = powf(freq_base, -2.0f/n_dims);
 const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
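RoPE now takes all of its arguments from dst->op_params: four ints read by index and two floats (freq_base, freq_scale) copied out of slots 4 and 5 with memcpy, rather than coming from a separate tensor. A minimal sketch of packing and unpacking such a mixed int/float parameter block; the layout mirrors the diff, the surrounding code is ours:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
    // op_params-style buffer: 32-bit slots, ints stored directly, floats byte-copied.
    int32_t op_params[8] = {0};
    op_params[0] = 128;  // n_past
    op_params[1] = 64;   // n_dims
    op_params[2] = 0;    // mode
    op_params[3] = 2048; // n_ctx
    const float freq_base = 10000.0f, freq_scale = 1.0f;
    std::memcpy(op_params + 4, &freq_base,  sizeof(float));
    std::memcpy(op_params + 5, &freq_scale, sizeof(float));

    // Reading side, as in ggml_cuda_op_rope (memcpy avoids type-punning issues):
    const int n_past = op_params[0];
    const int n_dims = op_params[1];
    float fb, fs;
    std::memcpy(&fb, op_params + 4, sizeof(float));
    std::memcpy(&fs, op_params + 5, sizeof(float));

    assert(n_past == 128 && n_dims == 64);
    assert(fb == 10000.0f && fs == 1.0f);
    return 0;
}
```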
@@ -3007,6 +3194,7 @@ inline void ggml_cuda_op_rope(
 rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
 }

+(void) src1;
 (void) dst;
 (void) src0_ddq_i;
 (void) src1_ddf_i;
@@ -3025,11 +3213,12 @@ inline void ggml_cuda_op_diag_mask_inf(
 const int64_t ne01 = src0->ne[1];
 const int64_t i01_diff = i01_high - i01_low;

-const int n_past = ((int32_t *)
+const int n_past = ((int32_t *) dst->op_params)[0];

 // compute
 diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);

+(void) src1;
 (void) dst;
 (void) src0_ddq_i;
 (void) src1_ddf_i;
@@ -3097,6 +3286,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
 const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
 const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
+const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+GGML_ASSERT(ne03 == ne13);

 const int64_t ne0 = dst->ne[0];
 const int64_t ne1 = dst->ne[1];
@@ -3108,12 +3300,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);

 // strides for iteration over dims 3 and 2
-const int64_t
-const int64_t
+const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
+const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
+const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
 const int64_t src0_stride = ne00 * ne01 * stride_mod;
 const int64_t src1_stride = ne10 * ne11 * stride_mod;
 const int64_t dst_stride = ne0 * ne1 * stride_mod;

+const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
+const int64_t i03_max = flatten_rows ? 1 : ne03;
+const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
+const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
+GGML_ASSERT(!(flatten_rows && ne02 < ne12));
+
 const size_t src0_ts = ggml_type_size(src0->type);
 const size_t src0_bs = ggml_blck_size(src0->type);

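ggml_cuda_op now iterates over the larger of ne02 and ne12 and, when src1 has more slices than src0, maps each iteration back to a src0 slice with i02_divisor = ne12 / ne02; src0 data is then only re-copied when i02 % i02_divisor == 0 (see the ggml_cuda_cpy_tensor_2d hunk below). A small sketch of that mapping, assuming ne12 is a multiple of ne02 as the new asserts require:

```cpp
#include <cassert>
#include <cstdint>

int main() {
    const int64_t ne02 = 2; // src0 slices
    const int64_t ne12 = 6; // src1 slices (broadcast target)
    const int64_t i02_max     = ne02 >= ne12 ? ne02 : ne12;     // iterate over the larger count
    const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02; // = 3 here

    int64_t copies = 0;
    for (int64_t i02 = 0; i02 < i02_max; ++i02) {
        const int64_t src0_slice = i02 / i02_divisor; // 0,0,0,1,1,1
        assert(src0_slice < ne02);
        if (i02 % i02_divisor == 0) {
            ++copies; // the diff only re-copies src0 data on these iterations
        }
    }
    assert(copies == ne02); // src0 is transferred once per real slice, not once per iteration
    return 0;
}
```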
@@ -3130,6 +3329,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);

 const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+GGML_ASSERT(!(split && ne02 < ne12));

 const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);

@@ -3166,7 +3366,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
 } else {
 row_low = 0;
-row_high = nrows0;
+row_high = nrows0*i02_divisor;
 }
 if (row_low == row_high) {
 continue;
@@ -3214,16 +3414,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
 }

-const int64_t i03_max = flatten_rows ? 1 : ne03;
-const int64_t i02_max = flatten_rows ? 1 : ne02;
-const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
-
 for (int64_t i03 = 0; i03 < i03_max; i03++) {
 const int64_t i13 = i03 % ne13;
 for (int64_t i02 = 0; i02 < i02_max; i02++) {
 const int64_t i12 = i02 % ne12;

-const int64_t i0 = i03*
+const int64_t i0 = i03*i02_max + i02;

 // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
 const int64_t i0_offset_low = row_low/rows_per_iter;
@@ -3257,10 +3453,10 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 const int64_t i11 = i13*ne12 + i12;

 // for split tensors the data begins at i0 == i0_offset_low
-char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
-float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
+char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
+float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
 float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
-float * dst_ddf_i = dst_ddf[id] + (i0
+float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;

 // for split tensors the data pointer needs to be rounded down
 // to the bin edge for i03, i02 bins beyond the first
@@ -3299,11 +3495,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 }
 }

-if (!src0_on_device || !src0_is_contiguous) {
+if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
 if (src0_is_f32) {
-CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
 } else {
-CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
 }
 }

@@ -3457,6 +3653,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
 const int64_t ne01 = src0->ne[1];
 const int64_t ne02 = src0->ne[2];

+const int64_t ne12 = src1->ne[2];
+
 CUDA_CHECK(cudaSetDevice(g_main_device));
 cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];

@@ -3469,7 +3667,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
 struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

-ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
+ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
 }

 void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -3483,6 +3681,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
 const int64_t ne01 = src0->ne[1];
 const int64_t ne02 = src0->ne[2];

+const int64_t ne12 = src1->ne[2];
+
 const int64_t nb01 = src0->nb[1];
 const int64_t nb02 = src0->nb[2];

@@ -3501,7 +3701,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
 const int row_stride_x = nb01 / sizeof(half);
 const int channel_stride_x = nb02 / sizeof(half);

-ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
+ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
 }

 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3642,7 +3842,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 size_t size = ggml_nbytes_split(tensor, nrows_split);
 const size_t original_size = size;

-// pad last row to a multiple of
+// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
 if (ne0 % MATRIX_ROW_PADDING != 0) {
 size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
 * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
@@ -3658,7 +3858,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 }


-CUDA_CHECK(cudaMemcpy(buf, buf_host,
+CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));

 extra->data_device[id] = buf;

@@ -3738,7 +3938,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
 char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
 size_t offset = 0;
 if (tensor->op == GGML_OP_VIEW) {
-memcpy(&offset, tensor->
+memcpy(&offset, tensor->op_params, sizeof(size_t));
 }
 extra = ggml_cuda_alloc_temp_tensor_extra();
 extra->data_device[g_main_device] = src0_ddc + offset;
@@ -3840,18 +4040,23 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
 }
 func = ggml_cuda_mul;
 break;
-case
-
-
-
-
-
-
-
-
-
-
-
+case GGML_OP_UNARY:
+switch (ggml_get_unary_op(tensor)) {
+case GGML_UNARY_OP_GELU:
+if (!any_on_device) {
+return false;
+}
+func = ggml_cuda_gelu;
+break;
+case GGML_UNARY_OP_SILU:
+if (!any_on_device) {
+return false;
+}
+func = ggml_cuda_silu;
+break;
+default:
+return false;
+} break;
 case GGML_OP_NORM:
 if (!any_on_device) {
 return false;
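The forward dispatch now routes GELU and SILU through a single GGML_OP_UNARY case and picks the concrete kernel with ggml_get_unary_op(tensor). A stripped-down sketch of the same two-level dispatch pattern; the enums and handlers below are illustrative stand-ins, not the ggml API:

```cpp
#include <cassert>
#include <functional>

enum class Op { Unary, Norm };
enum class UnaryOp { Gelu, Silu, Neg };

struct Tensor {
    Op op;
    UnaryOp unary_op; // meaningful only when op == Op::Unary
};

// Mirrors the new switch in ggml_cuda_compute_forward: first match the generic
// UNARY op, then pick the concrete kernel from the unary sub-op.
static bool pick_kernel(const Tensor & t, std::function<void()> & func) {
    switch (t.op) {
        case Op::Unary:
            switch (t.unary_op) {
                case UnaryOp::Gelu: func = [] { /* launch gelu kernel */ }; return true;
                case UnaryOp::Silu: func = [] { /* launch silu kernel */ }; return true;
                default:            return false; // unsupported unary op falls back to the CPU
            }
        case Op::Norm:
            func = [] { /* launch norm kernel */ };
            return true;
    }
    return false;
}

int main() {
    std::function<void()> f;
    assert(pick_kernel({Op::Unary, UnaryOp::Silu}, f));
    assert(!pick_kernel({Op::Unary, UnaryOp::Neg}, f)); // not handled on the GPU in this sketch
    return 0;
}
```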