llama_cpp 0.3.4 → 0.3.5

@@ -220,7 +220,7 @@ typedef struct {
  static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");

  #define WARP_SIZE 32
- #define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
+ #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

  #define CUDA_ADD_BLOCK_SIZE 256
  #define CUDA_MUL_BLOCK_SIZE 256
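
The bump from 256 to 512 matters because the quantized mat-vec path rounds every row length up to the next multiple of MATRIX_ROW_PADDING before allocating the q8_1 scratch buffer (see the padded_row_size hunk further down). A minimal host-side sketch of that rounding, assuming nothing beyond the formula used later in this diff:

    // illustration only: round a row length up to the next multiple of MATRIX_ROW_PADDING
    #include <cstdint>
    #include <cstdio>

    static const int64_t MATRIX_ROW_PADDING = 512;

    static int64_t pad_row(int64_t ne00) {
        return ne00 % MATRIX_ROW_PADDING == 0 ?
            ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
    }

    int main() {
        printf("%lld %lld\n", (long long) pad_row(4096), (long long) pad_row(4097)); // 4096 4608
        return 0;
    }
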
@@ -332,12 +332,10 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
  }
  }

- static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
+ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
  const int tid = threadIdx.x;

- const float eps = 1e-6f;
-
  float tmp = 0.0f; // partial sum for thread in warp

  for (int col = tid; col < ncols; col += WARP_SIZE) {
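
For reference, the kernel computes an ordinary RMS normalization per row; the only change in this hunk is that eps now arrives as a kernel argument instead of being hard-coded to 1e-6f, so the value set on the op can be used. A plain host sketch of the per-row math (illustration, not part of the diff):

    #include <cmath>
    #include <cstdio>

    // reference for one row: dst = x / sqrt(mean(x^2) + eps)
    static void rms_norm_row(const float * x, float * dst, int ncols, float eps) {
        float sumsq = 0.0f;
        for (int col = 0; col < ncols; ++col) {
            sumsq += x[col] * x[col];
        }
        const float scale = 1.0f / sqrtf(sumsq / ncols + eps);
        for (int col = 0; col < ncols; ++col) {
            dst[col] = x[col] * scale;
        }
    }

    int main() {
        float x[4] = {1.f, 2.f, 3.f, 4.f}, y[4];
        rms_norm_row(x, y, 4, 1e-6f);
        printf("%.4f %.4f %.4f %.4f\n", y[0], y[1], y[2], y[3]);
        return 0;
    }
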
@@ -935,12 +933,18 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
  uint16_t aux[4];
  const uint8_t * sc = (const uint8_t *)aux;

+ #if K_QUANTS_PER_ITERATION == 2
+ uint32_t q32[4];
+ const uint8_t * q4 = (const uint8_t *)q32;
+ #else
+ uint16_t q16[4];
+ const uint8_t * q4 = (const uint8_t *)q16;
+ #endif
+
  float tmp = 0; // partial sum for thread in warp

  for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

- const uint8_t * q1 = x[i].qs + q_offset;
- const uint8_t * q2 = q1 + 64;
  const float * y1 = yy + i*QK_K + y_offset;
  const float * y2 = y1 + 128;

@@ -953,14 +957,41 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
  aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
  aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);

+ #if K_QUANTS_PER_ITERATION == 2
+ const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+ const uint32_t * q2 = q1 + 16;
+
+ q32[0] = q1[0] & 0x0f0f0f0f;
+ q32[1] = q1[0] & 0xf0f0f0f0;
+ q32[2] = q2[0] & 0x0f0f0f0f;
+ q32[3] = q2[0] & 0xf0f0f0f0;
+
  float4 s = {0.f, 0.f, 0.f, 0.f};
  float smin = 0;
- for (int l = 0; l < n; ++l) {
- s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
- s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
+ for (int l = 0; l < 4; ++l) {
+ s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
+ s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
  smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
  }
- tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
+ tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+ #else
+ const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+ const uint16_t * q2 = q1 + 32;
+
+ q16[0] = q1[0] & 0x0f0f;
+ q16[1] = q1[0] & 0xf0f0;
+ q16[2] = q2[0] & 0x0f0f;
+ q16[3] = q2[0] & 0xf0f0;
+
+ float4 s = {0.f, 0.f, 0.f, 0.f};
+ float smin = 0;
+ for (int l = 0; l < 2; ++l) {
+ s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+ s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+ smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+ }
+ tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+ #endif

  }
  #else
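
The repacking above replaces per-byte & 0xF and >> 4 with two word-wide masks: q32[0] keeps the low nibbles, while q32[1] keeps the high nibbles in place, so each byte read back through q4 holds sixteen times the quant value. Rather than shifting every byte, the kernel folds the factor into the scales (the sc[1] and sc[5] terms are multiplied by 1.f/16.f). A small host sketch of the trick (illustration only):

    #include <cstdint>
    #include <cstdio>

    int main() {
        uint32_t packed = 0xA1B2C3D4u;                // four bytes, each holding two 4-bit quants
        uint32_t q32[2];
        q32[0] = packed & 0x0f0f0f0f;                 // low nibbles, already 0..15
        q32[1] = packed & 0xf0f0f0f0;                 // high nibbles left in place, i.e. value*16
        const uint8_t * q4 = (const uint8_t *) q32;   // byte view, as in the kernel
        for (int l = 0; l < 4; ++l) {
            printf("lo=%2u hi=%2u\n", q4[l], q4[l + 4] / 16);
        }
        return 0;
    }
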
@@ -1040,10 +1071,12 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
  uint16_t aux[4];
  const uint8_t * sc = (const uint8_t *)aux;

+ uint16_t q16[8];
+ const uint8_t * q4 = (const uint8_t *)q16;
+
  for (int i = ix; i < num_blocks_per_row; i += 2) {

  const uint8_t * ql1 = x[i].qs + q_offset;
- const uint8_t * ql2 = ql1 + 64;
  const uint8_t * qh = x[i].qh + l0;
  const float * y1 = yy + i*QK_K + y_offset;
  const float * y2 = y1 + 128;
@@ -1059,15 +1092,25 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,

  float4 sum = {0.f, 0.f, 0.f, 0.f};
  float smin = 0;
+ const uint16_t * q1 = (const uint16_t *)ql1;
+ const uint16_t * q2 = q1 + 32;
+ q16[0] = q1[0] & 0x0f0f;
+ q16[1] = q1[8] & 0x0f0f;
+ q16[2] = (q1[0] >> 4) & 0x0f0f;
+ q16[3] = (q1[8] >> 4) & 0x0f0f;
+ q16[4] = q2[0] & 0x0f0f;
+ q16[5] = q2[8] & 0x0f0f;
+ q16[6] = (q2[0] >> 4) & 0x0f0f;
+ q16[7] = (q2[8] >> 4) & 0x0f0f;
  for (int l = 0; l < n; ++l) {
- sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
- + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
- sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
- + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
- sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
- + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
- sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
- + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
+ sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+ + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
+ sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+ + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
+ sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+ + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
+ sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+ + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
  smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
  }
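
The q5_K version of the same idea uses 16-bit loads, but the high nibbles are shifted down before masking ((q1[0] >> 4) & 0x0f0f), so the bytes seen through q4 are already in 0..15 and no 1/16 compensation is needed here. A quick host check (illustration only; the values are arbitrary):

    #include <cstdint>
    #include <cstdio>

    int main() {
        uint16_t packed = 0xA1B2;                 // two bytes = four 4-bit quants
        uint16_t q16[2];
        q16[0] = packed & 0x0f0f;                 // low nibble of each byte
        q16[1] = (packed >> 4) & 0x0f0f;          // high nibble of each byte, shifted down
        const uint8_t * q4 = (const uint8_t *) q16;
        printf("%u %u %u %u\n", q4[0], q4[1], q4[2], q4[3]); // 2 1 11 10 on little-endian
        return 0;
    }
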
@@ -1521,33 +1564,95 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  const block_q4_K * bq4_K = (const block_q4_K *) vbq;

- const int bq8_offset = QR4_K * (iqs / QI8_1);
-
  float sumf_d = 0.0f;
  float sumf_m = 0.0f;

+ #ifndef GGML_QKK_64
+
+ // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
+ const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
+
  const float d = bq4_K->d;
  const float dmin = bq4_K->dmin;

- const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
+ // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
+ // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
+ // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
+ // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
+
+ const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+ const int v1 = q4[0];
+ const int v2 = q4[4];
+
+ const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+ uint16_t aux[2];
+ const int j = bq8_offset/2;
+ if (j < 2) {
+ aux[0] = scales[j+0] & 0x3f3f;
+ aux[1] = scales[j+2] & 0x3f3f;
+ } else {
+ aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+ aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+ }
+ const uint8_t * sc = (const uint8_t *)aux;
+ const uint8_t * m = sc + 2;

  for (int i = 0; i < QR4_K; ++i) {
- const int isc = bq8_offset + i;
-
- uint8_t sc, m;
- get_scale_min_k4(isc, bq4_K->scales, sc, m);

  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
  const float d8i = bq8i->d;
+ const int * q8 = (const int *)bq8i->qs + (iqs%4);
+ const int ui1 = q8[0];
+ const int ui2 = q8[4];
+
+ const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
+ const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;

- const int vi = (v >> (4*i)) & 0x0F0F0F0F;
+ const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+ const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));

- sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
- sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
+ sumf_d += d8i * (dot1 * sc[i]);
+ sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
  }

  return d*sumf_d - dmin*sumf_m;
+
+ #else
+
+ uint16_t aux16[2];
+ const uint8_t * s = (const uint8_t *)aux16;
+
+ const uint16_t * a = (const uint16_t *)bq4_K->scales;
+ aux16[0] = a[0] & 0x0f0f;
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+ const float dall = bq4_K->d[0];
+ const float dmin = bq4_K->d[1];
+
+ const float d8_1 = bq8_1[0].d;
+ const float d8_2 = bq8_1[1].d;
+
+ const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+ const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+ const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+ const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+ const int * q4 = (const int *)bq4_K->qs + iqs;
+ const int v1 = q4[0];
+ const int v2 = q4[4];
+
+ const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
+ const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
+ const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+ const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
+
+ sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
+ sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
+
+ return dall * sumf_d - dmin * sumf_m;
+
+ #endif
+
  #else
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
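
Both branches above lean on __dp4a, which multiplies four packed 8-bit lanes and accumulates into a 32-bit integer in one instruction; passing 0x01010101 as one operand turns it into a byte sum, which feeds the dmin correction term. A host-side reference of the signed variant (illustration only, not the device intrinsic itself):

    #include <cstdint>
    #include <cstdio>

    // reference for __dp4a(a, b, c) with signed 8-bit lanes: returns c + sum_i a_i*b_i
    static int dp4a_ref(int a, int b, int c) {
        const int8_t * pa = (const int8_t *) &a;
        const int8_t * pb = (const int8_t *) &b;
        for (int i = 0; i < 4; ++i) {
            c += pa[i] * pb[i];
        }
        return c;
    }

    int main() {
        printf("%d\n", dp4a_ref(0x04030201, 0x01010101, 0)); // 1+2+3+4 = 10 (byte sum)
        printf("%d\n", dp4a_ref(0x02020202, 0x03030303, 5)); // 5 + 4*(2*3) = 29
        return 0;
    }
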
@@ -1559,7 +1664,11 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  const block_q5_K * bq5_K = (const block_q5_K *) vbq;

- const int bq8_offset = QR5_K * (iqs / QI8_1);
+ #ifndef GGML_QKK_64
+
+ const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
+ const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+ const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));

  float sumf_d = 0.0f;
  float sumf_m = 0.0f;
@@ -1567,31 +1676,87 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  const float d = bq5_K->d;
  const float dmin = bq5_K->dmin;

- const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
+ const int vl1 = ql[0];
+ const int vl2 = ql[4];

- const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
+ const int vh1 = qh[0] >> bq8_offset;
+ const int vh2 = qh[4] >> bq8_offset;

- for (int i = 0; i < QR5_K; ++i) {
- const int isc = bq8_offset + i;
+ const uint16_t * scales = (const uint16_t *)bq5_K->scales;
+ uint16_t aux[2];
+ const int j = bq8_offset/2;
+ if (j < 2) {
+ aux[0] = scales[j+0] & 0x3f3f;
+ aux[1] = scales[j+2] & 0x3f3f;
+ } else {
+ aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+ aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+ }
+ const uint8_t * sc = (const uint8_t *)aux;
+ const uint8_t * m = sc + 2;

- uint8_t sc, m;
- get_scale_min_k4(isc, bq5_K->scales, sc, m);
+ for (int i = 0; i < QR5_K; ++i) {

  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
  const float d8i = bq8i->d;
+ const int * q8 = (const int *)bq8i->qs + (iqs%4);
+ const int ui1 = q8[0];
+ const int ui2 = q8[4];

- const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+ const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F;
+ const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
+
+ const int vih1 = ((vh1 >> i) << 4) & 0x10101010;
+ const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
+
+ const int vi1 = vil1 | vih1;
+ const int vi2 = vil2 | vih2;

- const int vih = ((vh >> i) << 4) & 0x10101010;
+ const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+ const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));

- const int vi = vil | vih;
+ sumf_d += d8i * (dot1 * sc[i]);
+ sumf_m += d8i * (dot2 * m[i]);

- sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
- sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
  }

  return d*sumf_d - dmin*sumf_m;
+
+ #else
+
+ const int8_t * s = bq5_K->scales;
+
+ const float d = bq5_K->d;
+
+ const float d8_1 = bq8_1[0].d;
+ const float d8_2 = bq8_1[1].d;
+
+ const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+ const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+ const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+ const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+ const int * ql = (const int *)bq5_K->qs + iqs;
+ const int vl1 = ql[0];
+ const int vl2 = ql[4];
+
+ const int step = 4 * iqs; // 0, 4, 8, 12
+ const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
+ const int in = step%8; // 0, 4, 0, 4
+ const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
+
+ const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
+ const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
+ const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
+ const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
+
+ const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
+ + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
+
+ return d * sumf_d;
+
+ #endif
+
  #else
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
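
In the QK_K == 256 branch the fifth bit of every q5_K value comes from qh: ((vh >> i) << 4) & 0x10101010 moves bit i of each byte into bit 4, and OR-ing that with the low nibbles rebuilds the 5-bit quants that go into __dp4a. A tiny host check of that bit motion (illustration only):

    #include <cstdio>

    int main() {
        const int vl  = 0x0F030A01;                     // four 4-bit values, one per byte
        const int vh  = 0x01010000;                     // bit 0 set only for the two upper bytes
        const int vih = ((vh >> 0) << 4) & 0x10101010;  // move each byte's bit 0 into bit 4
        const int v   = vl | vih;                       // 5-bit values per byte
        printf("%08X\n", v);                            // prints 1F130A01
        return 0;
    }
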
@@ -1745,11 +1910,15 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
  }
  }

- static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
+ static __global__ void mul_mat_p021_f16_f32(
+ const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
+
  const half * x = (const half *) vx;

  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+ const int channel_x = channel / (nchannels_y / nchannels_x);

  const int nrows_y = ncols_x;
  const int nrows_dst = nrows_x;
@@ -1765,7 +1934,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
  }

  // x is transposed and permuted
- const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x;
+ const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
  const float xi = __half2float(x[ix]);

  const int row_y = col_x;
@@ -1793,12 +1962,13 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const

  static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
  const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
- const int row_stride_x, const int channel_stride_x) {
+ const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {

  const half * x = (const half *) vx;

  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+ const int channel_x = channel / channel_x_divisor;

  const int nrows_y = ncols_x;
  const int nrows_dst = nrows_x;
@@ -1815,7 +1985,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
  break;
  }

- const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x;
+ const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
  const float xi = __half2float(x[ix]);

  const int row_y = col_x;
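
The new nchannels_y / channel_x_divisor arguments let these kernels broadcast the f16 matrix across channels when src1 and dst have more channels than src0: every group of nchannels_y/nchannels_x consecutive output channels reads the same src0 channel. A host sketch of that mapping (illustration only; the channel counts are made up):

    #include <cstdio>

    int main() {
        const int nchannels_x = 4;   // channels in the half-precision matrix x
        const int nchannels_y = 32;  // channels in y and dst
        for (int channel = 0; channel < nchannels_y; channel += 8) {
            const int channel_x = channel / (nchannels_y / nchannels_x);
            printf("dst channel %2d -> x channel %d\n", channel, channel_x);
        }
        return 0;
    }
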
@@ -2027,10 +2197,10 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
  norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
  }

- static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
  GGML_ASSERT(ncols % WARP_SIZE == 0);
  const dim3 block_dims(WARP_SIZE, 1, 1);
- rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
  }

  static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
@@ -2259,7 +2429,10 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
- mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+ // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
+ // kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
+ // is better amortized.
+ mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  }

@@ -2268,7 +2441,10 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
- mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
+ // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
+ // kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
+ // is better amortized.
+ mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  }

@@ -2324,20 +2500,23 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
  }
  }

- static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) {
- const dim3 block_nums(1, nrows_x, nchannels_x);
+ static void ggml_mul_mat_p021_f16_f32_cuda(
+ const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+ const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
+
+ const dim3 block_nums(1, nrows_x, nchannels_y);
  const dim3 block_dims(WARP_SIZE, 1, 1);
- mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
+ mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
  }

  static void ggml_mul_mat_vec_nc_f16_f32_cuda(
  const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
- const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
+ const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {

- const dim3 block_nums(1, nrows_x, nchannels_x);
+ const dim3 block_nums(1, nrows_x, nchannels_y);
  const dim3 block_dims(WARP_SIZE, 1, 1);
  mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
- (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
+ (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
  }

  static void ggml_cpy_f32_f32_cuda(
@@ -2497,7 +2676,9 @@ static size_t g_scratch_offset = 0;

  static int g_device_count = -1;
  static int g_main_device = 0;
+ #ifndef GGML_CUDA_FORCE_DMMV
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+ #endif
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};

  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -2520,7 +2701,9 @@ void ggml_init_cublas() {
  g_tensor_split[id] = total_vram;
  total_vram += prop.totalGlobalMem;

+ #ifndef GGML_CUDA_FORCE_DMMV
  g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+ #endif
  }
  for (int id = 0; id < g_device_count; ++id) {
  g_tensor_split[id] /= total_vram;
@@ -2688,6 +2871,7 @@ inline void ggml_cuda_op_mul(
  (void) dst;
  (void) src0_ddq_i;
  (void) i02;
+ (void) i1;
  }

  inline void ggml_cuda_op_gelu(
@@ -2767,8 +2951,11 @@ inline void ggml_cuda_op_rms_norm(
  const int64_t ne00 = src0->ne[0];
  const int64_t i01_diff = i01_high - i01_low;

+ float eps;
+ memcpy(&eps, dst->op_params, sizeof(float));
+
  // compute
- rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+ rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);

  (void) src1;
  (void) dst;
@@ -2815,8 +3002,8 @@ inline void ggml_cuda_op_mul_mat_vec(
  #endif

  if (use_mul_mat_vec_q) {
- int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
- padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
+ const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
+ ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
  size_t as;
  void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
  quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
@@ -2983,15 +3170,15 @@ inline void ggml_cuda_op_rope(
  const int64_t ne00 = src0->ne[0];
  const int64_t i01_diff = i01_high - i01_low;

- const int n_past = ((int32_t *) src1->data)[0];
- const int n_dims = ((int32_t *) src1->data)[1];
- const int mode = ((int32_t *) src1->data)[2];
- const int n_ctx = ((int32_t *) src1->data)[3];
-
+ const int n_past = ((int32_t *) dst->op_params)[0];
+ const int n_dims = ((int32_t *) dst->op_params)[1];
+ const int mode = ((int32_t *) dst->op_params)[2];
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
  // RoPE alteration for extended context
+
  float freq_base, freq_scale;
- memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
- memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

  const float theta_scale = powf(freq_base, -2.0f/n_dims);
  const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
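
RoPE now reads its parameters from dst->op_params rather than from a separate src1 tensor. The layout implied by the reads above is four int32 values (n_past, n_dims, mode, n_ctx) followed by two floats (freq_base, freq_scale) stored bit-for-bit via memcpy. A host sketch of packing and unpacking that layout (illustration only; op_params here is a local stand-in for the ggml_tensor field, and the parameter values are arbitrary):

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    int main() {
        int32_t op_params[8] = {0};            // stand-in for ggml_tensor::op_params
        op_params[0] = 32;                     // n_past
        op_params[1] = 128;                    // n_dims
        op_params[2] = 0;                      // mode
        op_params[3] = 2048;                   // n_ctx
        const float freq_base = 10000.0f, freq_scale = 1.0f;
        memcpy(op_params + 4, &freq_base,  sizeof(float));
        memcpy(op_params + 5, &freq_scale, sizeof(float));

        // read back the same way ggml_cuda_op_rope does above
        float fb, fs;
        memcpy(&fb, op_params + 4, sizeof(float));
        memcpy(&fs, op_params + 5, sizeof(float));
        printf("n_past=%d n_dims=%d freq_base=%.1f freq_scale=%.2f\n",
               op_params[0], op_params[1], fb, fs);
        return 0;
    }
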
@@ -3007,6 +3194,7 @@ inline void ggml_cuda_op_rope(
  rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
  }

+ (void) src1;
  (void) dst;
  (void) src0_ddq_i;
  (void) src1_ddf_i;
@@ -3025,11 +3213,12 @@ inline void ggml_cuda_op_diag_mask_inf(
  const int64_t ne01 = src0->ne[1];
  const int64_t i01_diff = i01_high - i01_low;

- const int n_past = ((int32_t *) src1->data)[0];
+ const int n_past = ((int32_t *) dst->op_params)[0];

  // compute
  diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);

+ (void) src1;
  (void) dst;
  (void) src0_ddq_i;
  (void) src1_ddf_i;
@@ -3097,6 +3286,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
  const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
  const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
+ const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+ GGML_ASSERT(ne03 == ne13);

  const int64_t ne0 = dst->ne[0];
  const int64_t ne1 = dst->ne[1];
@@ -3108,12 +3300,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);

  // strides for iteration over dims 3 and 2
- const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03;
- const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1;
+ const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
+ const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
+ const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
  const int64_t src0_stride = ne00 * ne01 * stride_mod;
  const int64_t src1_stride = ne10 * ne11 * stride_mod;
  const int64_t dst_stride = ne0 * ne1 * stride_mod;

+ const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
+ const int64_t i03_max = flatten_rows ? 1 : ne03;
+ const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
+ const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
+ GGML_ASSERT(!(flatten_rows && ne02 < ne12));
+
  const size_t src0_ts = ggml_type_size(src0->type);
  const size_t src0_bs = ggml_blck_size(src0->type);

@@ -3130,6 +3329,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);

  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+ GGML_ASSERT(!(split && ne02 < ne12));

  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);

@@ -3166,7 +3366,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
  } else {
  row_low = 0;
- row_high = nrows0;
+ row_high = nrows0*i02_divisor;
  }
  if (row_low == row_high) {
  continue;
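
i02_divisor is what implements broadcasting over dimension 2 in ggml_cuda_op: the loop now runs over max(ne02, ne12) slices, and when src1 has more dim-2 slices than src0, every i02_divisor consecutive iterations reuse the same src0 slice (index i02/i02_divisor), which is also why row_high is scaled by i02_divisor for non-split tensors. A host sketch of the index mapping (illustration only; the shape numbers are invented):

    #include <cstdio>

    int main() {
        const int ne02 = 2, ne12 = 8;                   // src0 has fewer dim-2 slices than src1
        const int i02_max = ne02 >= ne12 ? ne02 : ne12; // iterate over the larger count
        const int i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
        for (int i02 = 0; i02 < i02_max; ++i02) {
            printf("i02=%d uses src0 slice %d, src1 slice %d\n",
                   i02, i02 / i02_divisor, i02 % ne12);
        }
        return 0;
    }
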
@@ -3214,16 +3414,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
  }

- const int64_t i03_max = flatten_rows ? 1 : ne03;
- const int64_t i02_max = flatten_rows ? 1 : ne02;
- const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
-
  for (int64_t i03 = 0; i03 < i03_max; i03++) {
  const int64_t i13 = i03 % ne13;
  for (int64_t i02 = 0; i02 < i02_max; i02++) {
  const int64_t i12 = i02 % ne12;

- const int64_t i0 = i03*ne02 + i02;
+ const int64_t i0 = i03*i02_max + i02;

  // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
  const int64_t i0_offset_low = row_low/rows_per_iter;
@@ -3257,10 +3453,10 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  const int64_t i11 = i13*ne12 + i12;

  // for split tensors the data begins at i0 == i0_offset_low
- char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
- float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
+ char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
+ float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
  float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
- float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
+ float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;

  // for split tensors the data pointer needs to be rounded down
  // to the bin edge for i03, i02 bins beyond the first
@@ -3299,11 +3495,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  }
  }

- if (!src0_on_device || !src0_is_contiguous) {
+ if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
  if (src0_is_f32) {
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
  } else {
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
  }
  }

@@ -3457,6 +3653,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];

+ const int64_t ne12 = src1->ne[2];
+
  CUDA_CHECK(cudaSetDevice(g_main_device));
  cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];

@@ -3469,7 +3667,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

- ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
+ ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
  }

  void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -3483,6 +3681,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];

+ const int64_t ne12 = src1->ne[2];
+
  const int64_t nb01 = src0->nb[1];
  const int64_t nb02 = src0->nb[2];

@@ -3501,7 +3701,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
  const int row_stride_x = nb01 / sizeof(half);
  const int channel_stride_x = nb02 / sizeof(half);

- ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
+ ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
  }

  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3642,7 +3842,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  size_t size = ggml_nbytes_split(tensor, nrows_split);
  const size_t original_size = size;

- // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
  if (ne0 % MATRIX_ROW_PADDING != 0) {
  size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
  * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
@@ -3658,7 +3858,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  }


- CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));
+ CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));

  extra->data_device[id] = buf;

@@ -3738,7 +3938,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
  size_t offset = 0;
  if (tensor->op == GGML_OP_VIEW) {
- memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
+ memcpy(&offset, tensor->op_params, sizeof(size_t));
  }
  extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = src0_ddc + offset;
@@ -3840,18 +4040,23 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  }
  func = ggml_cuda_mul;
  break;
- case GGML_OP_GELU:
- if (!any_on_device) {
- return false;
- }
- func = ggml_cuda_gelu;
- break;
- case GGML_OP_SILU:
- if (!any_on_device) {
- return false;
- }
- func = ggml_cuda_silu;
- break;
+ case GGML_OP_UNARY:
+ switch (ggml_get_unary_op(tensor)) {
+ case GGML_UNARY_OP_GELU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_gelu;
+ break;
+ case GGML_UNARY_OP_SILU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_silu;
+ break;
+ default:
+ return false;
+ } break;
  case GGML_OP_NORM:
  if (!any_on_device) {
  return false;
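
GELU and SILU are now routed through the generic GGML_OP_UNARY case, with the concrete op recovered via ggml_get_unary_op(tensor) and dispatched in a nested switch; anything else hits the inner default and returns false, the same pattern the surrounding cases use for ops that are not offloaded. A stripped-down sketch of that dispatch shape, using stand-in enums and function pointers rather than the real ggml types (illustration only):

    #include <cstdio>

    // stand-ins for the purpose of this sketch, not the real ggml enums
    enum op_t    { OP_UNARY, OP_NORM };
    enum unary_t { UNARY_GELU, UNARY_SILU, UNARY_OTHER };
    typedef void (*op_func_t)(const char * name);

    static void run_gelu(const char * n) { printf("gelu on %s\n", n); }
    static void run_silu(const char * n) { printf("silu on %s\n", n); }

    static bool dispatch(op_t op, unary_t uop, op_func_t * func) {
        switch (op) {
            case OP_UNARY:
                switch (uop) {
                    case UNARY_GELU: *func = run_gelu; break;
                    case UNARY_SILU: *func = run_silu; break;
                    default: return false;  // unary op not handled here
                } break;
            default:
                return false;
        }
        return true;
    }

    int main() {
        op_func_t f = nullptr;
        if (dispatch(OP_UNARY, UNARY_SILU, &f)) {
            f("ffn_act");
        }
        return 0;
    }
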