llama_cpp 0.3.4 → 0.3.5

This diff compares the contents of publicly released package versions as published to their respective public registries. It is provided for informational purposes only.
@@ -220,7 +220,7 @@ typedef struct {
  static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");

  #define WARP_SIZE 32
- #define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
+ #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

  #define CUDA_ADD_BLOCK_SIZE 256
  #define CUDA_MUL_BLOCK_SIZE 256
@@ -332,12 +332,10 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
  }
  }

- static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
+ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
  const int tid = threadIdx.x;

- const float eps = 1e-6f;
-
  float tmp = 0.0f; // partial sum for thread in warp

  for (int col = tid; col < ncols; col += WARP_SIZE) {
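
The hunk above makes the RMS-norm epsilon a kernel argument instead of the hard-coded 1e-6f, so the value read from dst->op_params further down in this diff can be passed through. As a minimal scalar sketch of what one row computes (illustrative only; rms_norm_row_ref is a hypothetical reference helper, and the real kernel uses a warp-level reduction):

    #include <math.h>

    // y[i] = x[i] / sqrt(mean(x[i]^2) + eps), with eps now supplied by the caller
    static void rms_norm_row_ref(const float * x, float * dst, int ncols, float eps) {
        float sum = 0.0f;
        for (int i = 0; i < ncols; ++i) {
            sum += x[i]*x[i];                          // sum of squares over the row
        }
        const float scale = 1.0f / sqrtf(sum/ncols + eps);
        for (int i = 0; i < ncols; ++i) {
            dst[i] = x[i]*scale;                       // scale each element by the RMS
        }
    }
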
@@ -935,12 +933,18 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
  uint16_t aux[4];
  const uint8_t * sc = (const uint8_t *)aux;

+ #if K_QUANTS_PER_ITERATION == 2
+ uint32_t q32[4];
+ const uint8_t * q4 = (const uint8_t *)q32;
+ #else
+ uint16_t q16[4];
+ const uint8_t * q4 = (const uint8_t *)q16;
+ #endif
+
  float tmp = 0; // partial sum for thread in warp

  for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

- const uint8_t * q1 = x[i].qs + q_offset;
- const uint8_t * q2 = q1 + 64;
  const float * y1 = yy + i*QK_K + y_offset;
  const float * y2 = y1 + 128;

@@ -953,14 +957,41 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
  aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
  aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);

+ #if K_QUANTS_PER_ITERATION == 2
+ const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+ const uint32_t * q2 = q1 + 16;
+
+ q32[0] = q1[0] & 0x0f0f0f0f;
+ q32[1] = q1[0] & 0xf0f0f0f0;
+ q32[2] = q2[0] & 0x0f0f0f0f;
+ q32[3] = q2[0] & 0xf0f0f0f0;
+
  float4 s = {0.f, 0.f, 0.f, 0.f};
  float smin = 0;
- for (int l = 0; l < n; ++l) {
- s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
- s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
+ for (int l = 0; l < 4; ++l) {
+ s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
+ s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
  smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
  }
- tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
+ tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+ #else
+ const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+ const uint16_t * q2 = q1 + 32;
+
+ q16[0] = q1[0] & 0x0f0f;
+ q16[1] = q1[0] & 0xf0f0;
+ q16[2] = q2[0] & 0x0f0f;
+ q16[3] = q2[0] & 0xf0f0;
+
+ float4 s = {0.f, 0.f, 0.f, 0.f};
+ float smin = 0;
+ for (int l = 0; l < 2; ++l) {
+ s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+ s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+ smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+ }
+ tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+ #endif

  }
  #else
@@ -1040,10 +1071,12 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
  uint16_t aux[4];
  const uint8_t * sc = (const uint8_t *)aux;

+ uint16_t q16[8];
+ const uint8_t * q4 = (const uint8_t *)q16;
+
  for (int i = ix; i < num_blocks_per_row; i += 2) {

  const uint8_t * ql1 = x[i].qs + q_offset;
- const uint8_t * ql2 = ql1 + 64;
  const uint8_t * qh = x[i].qh + l0;
  const float * y1 = yy + i*QK_K + y_offset;
  const float * y2 = y1 + 128;
@@ -1059,15 +1092,25 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,

  float4 sum = {0.f, 0.f, 0.f, 0.f};
  float smin = 0;
+ const uint16_t * q1 = (const uint16_t *)ql1;
+ const uint16_t * q2 = q1 + 32;
+ q16[0] = q1[0] & 0x0f0f;
+ q16[1] = q1[8] & 0x0f0f;
+ q16[2] = (q1[0] >> 4) & 0x0f0f;
+ q16[3] = (q1[8] >> 4) & 0x0f0f;
+ q16[4] = q2[0] & 0x0f0f;
+ q16[5] = q2[8] & 0x0f0f;
+ q16[6] = (q2[0] >> 4) & 0x0f0f;
+ q16[7] = (q2[8] >> 4) & 0x0f0f;
  for (int l = 0; l < n; ++l) {
- sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
- + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
- sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
- + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
- sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
- + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
- sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
- + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
+ sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+ + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
+ sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+ + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
+ sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+ + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
+ sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+ + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
  smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
  }
@@ -1521,33 +1564,95 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  const block_q4_K * bq4_K = (const block_q4_K *) vbq;

- const int bq8_offset = QR4_K * (iqs / QI8_1);
-
  float sumf_d = 0.0f;
  float sumf_m = 0.0f;

+ #ifndef GGML_QKK_64
+
+ // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
+ const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
+
  const float d = bq4_K->d;
  const float dmin = bq4_K->dmin;

- const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
+ // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
+ // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
+ // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
+ // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
+
+ const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+ const int v1 = q4[0];
+ const int v2 = q4[4];
+
+ const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+ uint16_t aux[2];
+ const int j = bq8_offset/2;
+ if (j < 2) {
+ aux[0] = scales[j+0] & 0x3f3f;
+ aux[1] = scales[j+2] & 0x3f3f;
+ } else {
+ aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+ aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+ }
+ const uint8_t * sc = (const uint8_t *)aux;
+ const uint8_t * m = sc + 2;

  for (int i = 0; i < QR4_K; ++i) {
- const int isc = bq8_offset + i;
-
- uint8_t sc, m;
- get_scale_min_k4(isc, bq4_K->scales, sc, m);

  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
  const float d8i = bq8i->d;
+ const int * q8 = (const int *)bq8i->qs + (iqs%4);
+ const int ui1 = q8[0];
+ const int ui2 = q8[4];
+
+ const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
+ const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;

- const int vi = (v >> (4*i)) & 0x0F0F0F0F;
+ const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+ const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));

- sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
- sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
+ sumf_d += d8i * (dot1 * sc[i]);
+ sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
  }

  return d*sumf_d - dmin*sumf_m;
+
+ #else
+
+ uint16_t aux16[2];
+ const uint8_t * s = (const uint8_t *)aux16;
+
+ const uint16_t * a = (const uint16_t *)bq4_K->scales;
+ aux16[0] = a[0] & 0x0f0f;
+ aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+ const float dall = bq4_K->d[0];
+ const float dmin = bq4_K->d[1];
+
+ const float d8_1 = bq8_1[0].d;
+ const float d8_2 = bq8_1[1].d;
+
+ const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+ const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+ const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+ const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+ const int * q4 = (const int *)bq4_K->qs + iqs;
+ const int v1 = q4[0];
+ const int v2 = q4[4];
+
+ const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
+ const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
+ const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+ const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
+
+ sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
+ sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
+
+ return dall * sumf_d - dmin * sumf_m;
+
+ #endif
+
  #else
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
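
Both branches of the rewritten dot product lean on the __dp4a intrinsic, which multiplies four packed 8-bit lanes and adds the products to an accumulator. A scalar model of the signed-operand variant, as a reminder of the semantics (dp4a_ref is a hypothetical helper, not part of the diff):

    #include <stdint.h>

    // Scalar model of CUDA's __dp4a(a, b, c) for signed 32-bit operands:
    // treat a and b as four signed 8-bit lanes, multiply lane-wise, accumulate into c.
    static int dp4a_ref(int a, int b, int c) {
        for (int k = 0; k < 4; ++k) {
            const int8_t ai = (int8_t)(a >> (8*k));   // k-th byte of a, sign-extended
            const int8_t bi = (int8_t)(b >> (8*k));   // k-th byte of b, sign-extended
            c += (int)ai * (int)bi;
        }
        return c;
    }
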
@@ -1559,7 +1664,11 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  const block_q5_K * bq5_K = (const block_q5_K *) vbq;

- const int bq8_offset = QR5_K * (iqs / QI8_1);
+ #ifndef GGML_QKK_64
+
+ const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
+ const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+ const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));

  float sumf_d = 0.0f;
  float sumf_m = 0.0f;
@@ -1567,31 +1676,87 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  const float d = bq5_K->d;
  const float dmin = bq5_K->dmin;

- const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
+ const int vl1 = ql[0];
+ const int vl2 = ql[4];

- const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
+ const int vh1 = qh[0] >> bq8_offset;
+ const int vh2 = qh[4] >> bq8_offset;

- for (int i = 0; i < QR5_K; ++i) {
- const int isc = bq8_offset + i;
+ const uint16_t * scales = (const uint16_t *)bq5_K->scales;
+ uint16_t aux[2];
+ const int j = bq8_offset/2;
+ if (j < 2) {
+ aux[0] = scales[j+0] & 0x3f3f;
+ aux[1] = scales[j+2] & 0x3f3f;
+ } else {
+ aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+ aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+ }
+ const uint8_t * sc = (const uint8_t *)aux;
+ const uint8_t * m = sc + 2;

- uint8_t sc, m;
- get_scale_min_k4(isc, bq5_K->scales, sc, m);
+ for (int i = 0; i < QR5_K; ++i) {

  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
- const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
  const float d8i = bq8i->d;
+ const int * q8 = (const int *)bq8i->qs + (iqs%4);
+ const int ui1 = q8[0];
+ const int ui2 = q8[4];

- const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+ const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F;
+ const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
+
+ const int vih1 = ((vh1 >> i) << 4) & 0x10101010;
+ const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
+
+ const int vi1 = vil1 | vih1;
+ const int vi2 = vil2 | vih2;

- const int vih = ((vh >> i) << 4) & 0x10101010;
+ const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+ const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));

- const int vi = vil | vih;
+ sumf_d += d8i * (dot1 * sc[i]);
+ sumf_m += d8i * (dot2 * m[i]);

- sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
- sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
  }

  return d*sumf_d - dmin*sumf_m;
+
+ #else
+
+ const int8_t * s = bq5_K->scales;
+
+ const float d = bq5_K->d;
+
+ const float d8_1 = bq8_1[0].d;
+ const float d8_2 = bq8_1[1].d;
+
+ const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+ const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+ const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+ const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+ const int * ql = (const int *)bq5_K->qs + iqs;
+ const int vl1 = ql[0];
+ const int vl2 = ql[4];
+
+ const int step = 4 * iqs; // 0, 4, 8, 12
+ const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
+ const int in = step%8; // 0, 4, 0, 4
+ const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
+
+ const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
+ const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
+ const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
+ const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
+
+ const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
+ + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
+
+ return d * sumf_d;
+
+ #endif
+
  #else
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -1745,11 +1910,15 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
  }
  }

- static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
+ static __global__ void mul_mat_p021_f16_f32(
+ const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
+
  const half * x = (const half *) vx;

  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+ const int channel_x = channel / (nchannels_y / nchannels_x);

  const int nrows_y = ncols_x;
  const int nrows_dst = nrows_x;
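
The new nchannels_y argument lets the kernel broadcast the channels of x over a y/dst tensor that has more channels; the mapping is plain integer division, as in the channel_x line above. A small sketch of that mapping, assuming nchannels_y is a multiple of nchannels_x (the helper name is illustrative only):

    // Map an output/y channel back to the x channel it reuses.
    // e.g. with nchannels_x = 8 and nchannels_y = 32, y channels 0..3 read x channel 0,
    // y channels 4..7 read x channel 1, and so on.
    static int channel_x_for(int channel, int nchannels_x, int nchannels_y) {
        const int ratio = nchannels_y / nchannels_x;  // 32 / 8 = 4
        return channel / ratio;                       // integer division picks the shared x channel
    }
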
@@ -1765,7 +1934,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
  }

  // x is transposed and permuted
- const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x;
+ const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
  const float xi = __half2float(x[ix]);

  const int row_y = col_x;
@@ -1793,12 +1962,13 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const

  static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
  const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
- const int row_stride_x, const int channel_stride_x) {
+ const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {

  const half * x = (const half *) vx;

  const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
  const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+ const int channel_x = channel / channel_x_divisor;

  const int nrows_y = ncols_x;
  const int nrows_dst = nrows_x;
@@ -1815,7 +1985,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
  break;
  }

- const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x;
+ const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
  const float xi = __half2float(x[ix]);

  const int row_y = col_x;
@@ -2027,10 +2197,10 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
  norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
  }

- static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
  GGML_ASSERT(ncols % WARP_SIZE == 0);
  const dim3 block_dims(WARP_SIZE, 1, 1);
- rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
  }

  static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
@@ -2259,7 +2429,10 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
- mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+ // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
+ // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
+ // is better amortized.
+ mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  }

@@ -2268,7 +2441,10 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
  const dim3 block_nums(1, block_num_y, 1);
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
- mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
+ // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
+ // kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
+ // is better amortized.
+ mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
  }

@@ -2324,20 +2500,23 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
  }
  }

- static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) {
- const dim3 block_nums(1, nrows_x, nchannels_x);
+ static void ggml_mul_mat_p021_f16_f32_cuda(
+ const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+ const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
+
+ const dim3 block_nums(1, nrows_x, nchannels_y);
  const dim3 block_dims(WARP_SIZE, 1, 1);
- mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
+ mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
  }

  static void ggml_mul_mat_vec_nc_f16_f32_cuda(
  const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
- const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
+ const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {

- const dim3 block_nums(1, nrows_x, nchannels_x);
+ const dim3 block_nums(1, nrows_x, nchannels_y);
  const dim3 block_dims(WARP_SIZE, 1, 1);
  mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
- (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
+ (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
  }

  static void ggml_cpy_f32_f32_cuda(
@@ -2497,7 +2676,9 @@ static size_t g_scratch_offset = 0;

  static int g_device_count = -1;
  static int g_main_device = 0;
+ #ifndef GGML_CUDA_FORCE_DMMV
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+ #endif
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};

  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -2520,7 +2701,9 @@ void ggml_init_cublas() {
  g_tensor_split[id] = total_vram;
  total_vram += prop.totalGlobalMem;

+ #ifndef GGML_CUDA_FORCE_DMMV
  g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+ #endif
  }
  for (int id = 0; id < g_device_count; ++id) {
  g_tensor_split[id] /= total_vram;
@@ -2688,6 +2871,7 @@ inline void ggml_cuda_op_mul(
  (void) dst;
  (void) src0_ddq_i;
  (void) i02;
+ (void) i1;
  }

  inline void ggml_cuda_op_gelu(
@@ -2767,8 +2951,11 @@ inline void ggml_cuda_op_rms_norm(
  const int64_t ne00 = src0->ne[0];
  const int64_t i01_diff = i01_high - i01_low;

+ float eps;
+ memcpy(&eps, dst->op_params, sizeof(float));
+
  // compute
- rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+ rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);

  (void) src1;
  (void) dst;
@@ -2815,8 +3002,8 @@ inline void ggml_cuda_op_mul_mat_vec(
  #endif

  if (use_mul_mat_vec_q) {
- int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
- padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
+ const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
+ ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
  size_t as;
  void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
  quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
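
Both the old and the new padded_row_size expression round ne00 up to the next multiple of MATRIX_ROW_PADDING (now 512); the rewrite computes it as a single const expression. A worked sketch of the arithmetic, with values chosen purely for illustration:

    #include <stdint.h>

    // Round n up to the next multiple of padding.
    // e.g. with padding = 512: n = 4096 stays 4096, n = 4200 becomes 4608.
    static int64_t round_up_to_multiple(int64_t n, int64_t padding) {
        return n % padding == 0 ? n : n - n % padding + padding;
    }
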
@@ -2983,15 +3170,15 @@ inline void ggml_cuda_op_rope(
  const int64_t ne00 = src0->ne[0];
  const int64_t i01_diff = i01_high - i01_low;

- const int n_past = ((int32_t *) src1->data)[0];
- const int n_dims = ((int32_t *) src1->data)[1];
- const int mode = ((int32_t *) src1->data)[2];
- const int n_ctx = ((int32_t *) src1->data)[3];
-
+ const int n_past = ((int32_t *) dst->op_params)[0];
+ const int n_dims = ((int32_t *) dst->op_params)[1];
+ const int mode = ((int32_t *) dst->op_params)[2];
+ const int n_ctx = ((int32_t *) dst->op_params)[3];
  // RoPE alteration for extended context
+
  float freq_base, freq_scale;
- memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
- memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

  const float theta_scale = powf(freq_base, -2.0f/n_dims);
  const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
@@ -3007,6 +3194,7 @@ inline void ggml_cuda_op_rope(
  rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
  }

+ (void) src1;
  (void) dst;
  (void) src0_ddq_i;
  (void) src1_ddf_i;
@@ -3025,11 +3213,12 @@ inline void ggml_cuda_op_diag_mask_inf(
  const int64_t ne01 = src0->ne[1];
  const int64_t i01_diff = i01_high - i01_low;

- const int n_past = ((int32_t *) src1->data)[0];
+ const int n_past = ((int32_t *) dst->op_params)[0];

  // compute
  diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);

+ (void) src1;
  (void) dst;
  (void) src0_ddq_i;
  (void) src1_ddf_i;
@@ -3097,6 +3286,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
  const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
  const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
+ const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+ GGML_ASSERT(ne03 == ne13);

  const int64_t ne0 = dst->ne[0];
  const int64_t ne1 = dst->ne[1];
@@ -3108,12 +3300,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);

  // strides for iteration over dims 3 and 2
- const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03;
- const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1;
+ const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
+ const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
+ const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
  const int64_t src0_stride = ne00 * ne01 * stride_mod;
  const int64_t src1_stride = ne10 * ne11 * stride_mod;
  const int64_t dst_stride = ne0 * ne1 * stride_mod;

+ const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
+ const int64_t i03_max = flatten_rows ? 1 : ne03;
+ const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
+ const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
+ GGML_ASSERT(!(flatten_rows && ne02 < ne12));
+
  const size_t src0_ts = ggml_type_size(src0->type);
  const size_t src0_bs = ggml_blck_size(src0->type);

@@ -3130,6 +3329,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);

  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+ GGML_ASSERT(!(split && ne02 < ne12));

  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);

@@ -3166,7 +3366,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
  } else {
  row_low = 0;
- row_high = nrows0;
+ row_high = nrows0*i02_divisor;
  }
  if (row_low == row_high) {
  continue;
@@ -3214,16 +3414,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
  }

- const int64_t i03_max = flatten_rows ? 1 : ne03;
- const int64_t i02_max = flatten_rows ? 1 : ne02;
- const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
-
  for (int64_t i03 = 0; i03 < i03_max; i03++) {
  const int64_t i13 = i03 % ne13;
  for (int64_t i02 = 0; i02 < i02_max; i02++) {
  const int64_t i12 = i02 % ne12;

- const int64_t i0 = i03*ne02 + i02;
+ const int64_t i0 = i03*i02_max + i02;

  // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
  const int64_t i0_offset_low = row_low/rows_per_iter;
@@ -3257,10 +3453,10 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  const int64_t i11 = i13*ne12 + i12;

  // for split tensors the data begins at i0 == i0_offset_low
- char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
- float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
+ char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
+ float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
  float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
- float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
+ float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;

  // for split tensors the data pointer needs to be rounded down
  // to the bin edge for i03, i02 bins beyond the first
@@ -3299,11 +3495,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  }
  }

- if (!src0_on_device || !src0_is_contiguous) {
+ if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
  if (src0_is_f32) {
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
  } else {
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
  }
  }

@@ -3457,6 +3653,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];

+ const int64_t ne12 = src1->ne[2];
+
  CUDA_CHECK(cudaSetDevice(g_main_device));
  cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];

@@ -3469,7 +3667,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

- ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
+ ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
  }

  void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -3483,6 +3681,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];

+ const int64_t ne12 = src1->ne[2];
+
  const int64_t nb01 = src0->nb[1];
  const int64_t nb02 = src0->nb[2];

@@ -3501,7 +3701,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
  const int row_stride_x = nb01 / sizeof(half);
  const int channel_stride_x = nb02 / sizeof(half);

- ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
+ ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
  }

  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3642,7 +3842,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  size_t size = ggml_nbytes_split(tensor, nrows_split);
  const size_t original_size = size;

- // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
  if (ne0 % MATRIX_ROW_PADDING != 0) {
  size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
  * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
@@ -3658,7 +3858,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  }


- CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));
+ CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));

  extra->data_device[id] = buf;

@@ -3738,7 +3938,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
  size_t offset = 0;
  if (tensor->op == GGML_OP_VIEW) {
- memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
+ memcpy(&offset, tensor->op_params, sizeof(size_t));
  }
  extra = ggml_cuda_alloc_temp_tensor_extra();
  extra->data_device[g_main_device] = src0_ddc + offset;
@@ -3840,18 +4040,23 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  }
  func = ggml_cuda_mul;
  break;
- case GGML_OP_GELU:
- if (!any_on_device) {
- return false;
- }
- func = ggml_cuda_gelu;
- break;
- case GGML_OP_SILU:
- if (!any_on_device) {
- return false;
- }
- func = ggml_cuda_silu;
- break;
+ case GGML_OP_UNARY:
+ switch (ggml_get_unary_op(tensor)) {
+ case GGML_UNARY_OP_GELU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_gelu;
+ break;
+ case GGML_UNARY_OP_SILU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_silu;
+ break;
+ default:
+ return false;
+ } break;
  case GGML_OP_NORM:
  if (!any_on_device) {
  return false;