llama_cpp 0.3.6 → 0.3.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -14,6 +14,7 @@
14
14
  #include "ggml.h"
15
15
 
16
16
  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
17
+ #define CC_TURING 700
17
18
 
18
19
  #if defined(_MSC_VER)
19
20
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -262,10 +263,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
262
263
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
263
264
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
264
265
 
265
- #ifndef GGML_CUDA_MMQ_Y
266
- #define GGML_CUDA_MMQ_Y 64
267
- #endif // GGML_CUDA_MMQ_Y
268
-
269
266
  // dmmv = dequantize_mul_mat_vec
270
267
  #ifndef GGML_CUDA_DMMV_X
271
268
  #define GGML_CUDA_DMMV_X 32
@@ -285,6 +282,20 @@ struct ggml_tensor_extra_gpu {
285
282
  cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
286
283
  };
287
284
 
285
+ static int g_device_count = -1;
286
+ static int g_main_device = 0;
287
+ static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
288
+ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
289
+ static bool g_mul_mat_q = false;
290
+
291
+ static void * g_scratch_buffer = nullptr;
292
+ static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
293
+ static size_t g_scratch_offset = 0;
294
+
295
+ static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
296
+
297
+ static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
298
+
288
299
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
289
300
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
290
301
 
@@ -1383,8 +1394,10 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
1383
1394
  sumi = __dp4a(vi1, u[2*i+1], sumi);
1384
1395
  }
1385
1396
 
1397
+ const float2 ds8f = __half22float2(ds8);
1398
+
1386
1399
  // second part effectively subtracts 8 from each quant value
1387
- return d4 * (sumi * __half2float(ds8.x) - (8*vdr/QI4_0) * __half2float(ds8.y));
1400
+ return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
1388
1401
  #else
1389
1402
  return 0.0f; // only to satisfy the compiler
1390
1403
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -1410,12 +1423,14 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
1410
1423
  }
1411
1424
 
1412
1425
  #ifdef GGML_CUDA_F16
1413
- const half2 tmp = __hmul2(dm4, ds8);
1414
- const float d4d8 = __half2float(tmp.x);
1415
- const float m4s8 = __half2float(tmp.y);
1426
+ const float2 tmp = __half22float2(__hmul2(dm4, ds8));
1427
+ const float d4d8 = tmp.x;
1428
+ const float m4s8 = tmp.y;
1416
1429
  #else
1417
- const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x);
1418
- const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y);
1430
+ const float2 dm4f = __half22float2(dm4);
1431
+ const float2 ds8f = __half22float2(ds8);
1432
+ const float d4d8 = dm4f.x * ds8f.x;
1433
+ const float m4s8 = dm4f.y * ds8f.y;
1419
1434
  #endif // GGML_CUDA_F16
1420
1435
 
1421
1436
  // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
@@ -1434,6 +1449,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
1434
1449
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1435
1450
  int sumi = 0;
1436
1451
 
1452
+ #pragma unroll
1437
1453
  for (int i = 0; i < vdr; ++i) {
1438
1454
  int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
1439
1455
  vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
@@ -1450,8 +1466,10 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
1450
1466
  sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
1451
1467
  }
1452
1468
 
1469
+ const float2 ds8f = __half22float2(ds8);
1470
+
1453
1471
  // second part effectively subtracts 16 from each quant value
1454
- return d5 * (sumi*__half2float(ds8.x) - (16*vdr/QI5_0) * __half2float(ds8.y));
1472
+ return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
1455
1473
  #else
1456
1474
  return 0.0f; // only to satisfy the compiler
1457
1475
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -1466,6 +1484,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
1466
1484
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1467
1485
  int sumi = 0;
1468
1486
 
1487
+ #pragma unroll
1469
1488
  for (int i = 0; i < vdr; ++i) {
1470
1489
  int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
1471
1490
  vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
@@ -1483,12 +1502,14 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
1483
1502
  }
1484
1503
 
1485
1504
  #ifdef GGML_CUDA_F16
1486
- const half2 tmp = __hmul2(dm5, ds8);
1487
- const float d5d8 = __half2float(tmp.x);
1488
- const float m5s8 = __half2float(tmp.y);
1505
+ const float2 tmp = __half22float2(__hmul2(dm5, ds8));
1506
+ const float d5d8 = tmp.x;
1507
+ const float m5s8 = tmp.y;
1489
1508
  #else
1490
- const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x);
1491
- const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y);
1509
+ const float2 dm5f = __half22float2(dm5);
1510
+ const float2 ds8f = __half22float2(ds8);
1511
+ const float d5d8 = dm5f.x * ds8f.x;
1512
+ const float m5s8 = dm5f.y * ds8f.y;
1492
1513
  #endif // GGML_CUDA_F16
1493
1514
 
1494
1515
  // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
@@ -1503,17 +1524,18 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
1503
1524
  #define VDR_Q8_0_Q8_1_MMQ 8
1504
1525
 
1505
1526
  template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
1506
- const int * v, const int * u, const float & d8_0, const half2 & ds8_1) {
1527
+ const int * v, const int * u, const float & d8_0, const float & d8_1) {
1507
1528
 
1508
1529
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1509
1530
  int sumi = 0;
1510
1531
 
1532
+ #pragma unroll
1511
1533
  for (int i = 0; i < vdr; ++i) {
1512
1534
  // SIMD dot product of quantized values
1513
1535
  sumi = __dp4a(v[i], u[i], sumi);
1514
1536
  }
1515
1537
 
1516
- return sumi * d8_0 * __half2float(ds8_1.x);
1538
+ return d8_0*d8_1 * sumi;
1517
1539
  #else
1518
1540
  return 0.0f; // only to satisfy the compiler
1519
1541
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -1525,18 +1547,21 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
1525
1547
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1526
1548
  int sumi = 0;
1527
1549
 
1550
+ #pragma unroll
1528
1551
  for (int i = 0; i < vdr; ++i) {
1529
1552
  // SIMD dot product of quantized values
1530
1553
  sumi = __dp4a(v[i], u[i], sumi);
1531
1554
  }
1532
1555
 
1533
1556
  #ifdef GGML_CUDA_F16
1534
- const half2 tmp = __hmul2(dm8, ds8);
1535
- const float d8d8 = __half2float(tmp.x);
1536
- const float m8s8 = __half2float(tmp.y);
1557
+ const float2 tmp = __half22float2(__hmul2(dm8, ds8));
1558
+ const float d8d8 = tmp.x;
1559
+ const float m8s8 = tmp.y;
1537
1560
  #else
1538
- const float d8d8 = __half2float(dm8.x) * __half2float(ds8.x);
1539
- const float m8s8 = __half2float(dm8.y) * __half2float(ds8.y);
1561
+ const float2 dm8f = __half22float2(dm8);
1562
+ const float2 ds8f = __half22float2(ds8);
1563
+ const float d8d8 = dm8f.x * ds8f.x;
1564
+ const float m8s8 = dm8f.y * ds8f.y;
1540
1565
  #endif // GGML_CUDA_F16
1541
1566
 
1542
1567
  // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
@@ -1546,6 +1571,312 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
1546
1571
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1547
1572
  }
1548
1573
 
1574
+ #define VDR_Q2_K_Q8_1_MMVQ 1
1575
+ #define VDR_Q2_K_Q8_1_MMQ 2
1576
+
1577
+ // contiguous v/x values
1578
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
1579
+ const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
1580
+ const half2 & dm2, const float * __restrict__ d8) {
1581
+
1582
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1583
+ float sumf_d = 0.0f;
1584
+ float sumf_m = 0.0f;
1585
+
1586
+ #pragma unroll
1587
+ for (int i = 0; i < QR2_K; ++i) {
1588
+ const int sc = scales[2*i];
1589
+
1590
+ const int vi = (v >> (2*i)) & 0x03030303;
1591
+
1592
+ sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
1593
+
1594
+ // fill int with 4x m
1595
+ int m = sc >> 4;
1596
+ m |= m << 8;
1597
+ m |= m << 16;
1598
+ sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
1599
+ }
1600
+
1601
+ const float2 dm2f = __half22float2(dm2);
1602
+
1603
+ return dm2f.x*sumf_d - dm2f.y*sumf_m;
1604
+ #else
1605
+ return 0.0f; // only to satisfy the compiler
1606
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1607
+ }
1608
+
1609
+ // contiguous u/y values
1610
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
1611
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
1612
+ const half2 & dm2, const float & d8) {
1613
+
1614
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1615
+ int sumi_d = 0;
1616
+ int sumi_m = 0;
1617
+
1618
+ #pragma unroll
1619
+ for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
1620
+ int sumi_d_sc = 0;
1621
+
1622
+ const int sc = scales[i0 / (QI8_1/2)];
1623
+
1624
+ // fill int with 4x m
1625
+ int m = sc >> 4;
1626
+ m |= m << 8;
1627
+ m |= m << 16;
1628
+
1629
+ #pragma unroll
1630
+ for (int i = i0; i < i0 + QI8_1/2; ++i) {
1631
+ sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
1632
+ sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m
1633
+ }
1634
+
1635
+ sumi_d += sumi_d_sc * (sc & 0xF);
1636
+ }
1637
+
1638
+ const float2 dm2f = __half22float2(dm2);
1639
+
1640
+ return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
1641
+ #else
1642
+ return 0.0f; // only to satisfy the compiler
1643
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1644
+ }
1645
+
1646
+ #define VDR_Q3_K_Q8_1_MMVQ 1
1647
+ #define VDR_Q3_K_Q8_1_MMQ 2
1648
+
1649
+ // contiguous v/x values
1650
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
1651
+ const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
1652
+ const int & scale_offset, const float & d3, const float * __restrict__ d8) {
1653
+
1654
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1655
+ float sumf = 0.0f;
1656
+
1657
+ #pragma unroll
1658
+ for (int i = 0; i < QR3_K; ++i) {
1659
+ const int isc = scale_offset + 2*i;
1660
+
1661
+ const int isc_low = isc % (QK_K/32);
1662
+ const int sc_shift_low = 4 * (isc / (QK_K/32));
1663
+ const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
1664
+
1665
+ const int isc_high = isc % (QK_K/64);
1666
+ const int sc_shift_high = 2 * (isc / (QK_K/64));
1667
+ const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
1668
+
1669
+ const int sc = (sc_low | sc_high) - 32;
1670
+
1671
+ const int vil = (vl >> (2*i)) & 0x03030303;
1672
+
1673
+ const int vih = ((vh >> i) << 2) & 0x04040404;
1674
+
1675
+ const int vi = __vsubss4(vil, vih);
1676
+
1677
+ sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
1678
+ }
1679
+
1680
+ return d3 * sumf;
1681
+ #else
1682
+ return 0.0f; // only to satisfy the compiler
1683
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1684
+ }
1685
+
1686
+ // contiguous u/y values
1687
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
1688
+ const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
1689
+ const float & d3, const float & d8) {
1690
+
1691
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1692
+ int sumi = 0;
1693
+
1694
+ #pragma unroll
1695
+ for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
1696
+ int sumi_sc = 0;
1697
+
1698
+ for (int i = i0; i < i0 + QI8_1/2; ++i) {
1699
+ sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
1700
+ }
1701
+
1702
+ sumi += sumi_sc * scales[i0 / (QI8_1/2)];
1703
+ }
1704
+
1705
+ return d3*d8 * sumi;
1706
+ #else
1707
+ return 0.0f; // only to satisfy the compiler
1708
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1709
+ }
1710
+
1711
+ #define VDR_Q4_K_Q8_1_MMVQ 2
1712
+ #define VDR_Q4_K_Q8_1_MMQ 8
1713
+
1714
+ // contiguous v/x values
1715
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
1716
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
1717
+ const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
1718
+
1719
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1720
+ float sumf_d = 0.0f;
1721
+ float sumf_m = 0.0f;
1722
+
1723
+ #pragma unroll
1724
+ for (int i = 0; i < QR4_K; ++i) {
1725
+ const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
1726
+ const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
1727
+
1728
+ const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
1729
+ const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
1730
+
1731
+ sumf_d += d8[i] * (dot1 * sc[i]);
1732
+ sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
1733
+ }
1734
+
1735
+ const float2 dm4f = __half22float2(dm4);
1736
+
1737
+ return dm4f.x*sumf_d - dm4f.y*sumf_m;
1738
+
1739
+ #else
1740
+ return 0.0f; // only to satisfy the compiler
1741
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1742
+ }
1743
+
1744
+ // contiguous u/y values
1745
+ // also used for q5_K
1746
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
1747
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
1748
+ const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
1749
+
1750
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1751
+ float sumf_d = 0.0f;
1752
+ float sumf_m = 0.0f;
1753
+
1754
+ #pragma unroll
1755
+ for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) {
1756
+ int sumi_d = 0;
1757
+
1758
+ #pragma unroll
1759
+ for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) {
1760
+ sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product
1761
+ sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
1762
+ }
1763
+
1764
+ const float2 ds8f = __half22float2(ds8[i0 / 4]);
1765
+
1766
+ sumf_d += ds8f.x * (sc[i0/4] * sumi_d);
1767
+ sumf_m += ds8f.y * m[i0/4]; // sum of q8_1 block * q4_K min val
1768
+ }
1769
+
1770
+ const float2 dm4f = __half22float2(dm4);
1771
+
1772
+ return dm4f.x*sumf_d - dm4f.y*sumf_m;
1773
+
1774
+ #else
1775
+ return 0.0f; // only to satisfy the compiler
1776
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1777
+ }
1778
+
1779
+ #define VDR_Q5_K_Q8_1_MMVQ 2
1780
+ #define VDR_Q5_K_Q8_1_MMQ 8
1781
+
1782
+ // contiguous v/x values
1783
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
1784
+ const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
1785
+ const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
1786
+
1787
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1788
+ float sumf_d = 0.0f;
1789
+ float sumf_m = 0.0f;
1790
+
1791
+ #pragma unroll
1792
+ for (int i = 0; i < QR5_K; ++i) {
1793
+ const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
1794
+ const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
1795
+
1796
+ const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
1797
+ const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
1798
+
1799
+ const int v0i = vl0i | vh0i;
1800
+ const int v1i = vl1i | vh1i;
1801
+
1802
+ const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
1803
+ const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
1804
+
1805
+ sumf_d += d8[i] * (dot1 * sc[i]);
1806
+ sumf_m += d8[i] * (dot2 * m[i]);
1807
+
1808
+ }
1809
+
1810
+ const float2 dm5f = __half22float2(dm5);
1811
+
1812
+ return dm5f.x*sumf_d - dm5f.y*sumf_m;
1813
+
1814
+ #else
1815
+ return 0.0f; // only to satisfy the compiler
1816
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1817
+ }
1818
+
1819
+ #define VDR_Q6_K_Q8_1_MMVQ 1
1820
+ #define VDR_Q6_K_Q8_1_MMQ 8
1821
+
1822
+ // contiguous v/x values
1823
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
1824
+ const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
1825
+ const float & d, const float * __restrict__ d8) {
1826
+
1827
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1828
+ float sumf = 0.0f;
1829
+
1830
+ #pragma unroll
1831
+ for (int i = 0; i < QR6_K; ++i) {
1832
+ const int sc = scales[4*i];
1833
+
1834
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
1835
+
1836
+ const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
1837
+
1838
+ const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
1839
+
1840
+ sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
1841
+ }
1842
+
1843
+ return d*sumf;
1844
+ #else
1845
+ return 0.0f; // only to satisfy the compiler
1846
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1847
+ }
1848
+
1849
+ // contiguous u/y values
1850
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
1851
+ const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
1852
+ const float & d6, const float * __restrict__ d8) {
1853
+
1854
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1855
+ float sumf_d = 0.0f;
1856
+
1857
+ #pragma unroll
1858
+ for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
1859
+ int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
1860
+
1861
+ #pragma unroll
1862
+ for (int i = i0; i < i0 + 2; ++i) {
1863
+ sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
1864
+ sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
1865
+
1866
+ sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
1867
+ sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
1868
+ }
1869
+
1870
+ sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
1871
+ }
1872
+
1873
+ return d6 * sumf_d;
1874
+
1875
+ #else
1876
+ return 0.0f; // only to satisfy the compiler
1877
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1878
+ }
1879
+
1549
1880
  static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1550
1881
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1551
1882
 
@@ -1564,21 +1895,21 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1564
1895
  return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
1565
1896
  }
1566
1897
 
1567
- static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1898
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1568
1899
 
1569
- __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
1570
- __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0];
1900
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
1901
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
1571
1902
 
1572
1903
  *x_ql = tile_x_qs;
1573
1904
  *x_dm = (half2 *) tile_x_d;
1574
1905
  }
1575
1906
 
1576
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
1907
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
1577
1908
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1578
1909
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1579
1910
 
1580
1911
  __builtin_assume(i_offset >= 0);
1581
- __builtin_assume(i_offset < 8);
1912
+ __builtin_assume(i_offset < nwarps);
1582
1913
  __builtin_assume(k >= 0);
1583
1914
  __builtin_assume(k < WARP_SIZE);
1584
1915
 
@@ -1590,7 +1921,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
1590
1921
  float * x_dmf = (float *) x_dm;
1591
1922
 
1592
1923
  #pragma unroll
1593
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
1924
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
1594
1925
  int i = i0 + i_offset;
1595
1926
 
1596
1927
  if (need_check) {
@@ -1600,38 +1931,30 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
1600
1931
  const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
1601
1932
 
1602
1933
  x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
1603
- x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
1934
+ // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
1604
1935
  }
1605
1936
 
1606
- // const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
1607
- // const int kbxd = k % blocks_per_tile_x_row;
1937
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
1938
+ const int kbxd = k % blocks_per_tile_x_row;
1608
1939
 
1609
- // #pragma unroll
1610
- // for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) {
1611
- // FIXME out-of-bounds
1612
- // const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
1940
+ #pragma unroll
1941
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
1942
+ int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
1613
1943
 
1614
- // if (i >= GGML_CUDA_MMQ_Y) {
1615
- // return;
1616
- // }
1944
+ if (need_check) {
1945
+ i = min(i, i_max);
1946
+ }
1617
1947
 
1618
- // const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
1948
+ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
1619
1949
 
1620
- // x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d;
1621
- // }
1950
+ x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
1951
+ }
1622
1952
  }
1623
1953
 
1624
1954
  static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
1625
1955
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1626
1956
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1627
1957
 
1628
- __builtin_assume(i >= 0);
1629
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
1630
- __builtin_assume(j >= 0);
1631
- __builtin_assume(j < WARP_SIZE);
1632
- __builtin_assume(k >= 0);
1633
- __builtin_assume(k < WARP_SIZE);
1634
-
1635
1958
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1636
1959
  const float * x_dmf = (float *) x_dm;
1637
1960
 
@@ -1639,13 +1962,13 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
1639
1962
 
1640
1963
  #pragma unroll
1641
1964
  for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
1642
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1643
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0];
1965
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
1966
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
1644
1967
  }
1645
1968
 
1646
1969
  return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
1647
1970
  (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
1648
- y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
1971
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
1649
1972
  }
1650
1973
 
1651
1974
  static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
@@ -1666,21 +1989,21 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
1666
1989
  return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
1667
1990
  }
1668
1991
 
1669
- static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1992
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1670
1993
 
1671
- __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + + GGML_CUDA_MMQ_Y];
1672
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1) + GGML_CUDA_MMQ_Y/QI4_1];
1994
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
1995
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
1673
1996
 
1674
1997
  *x_ql = tile_x_qs;
1675
1998
  *x_dm = tile_x_dm;
1676
1999
  }
1677
2000
 
1678
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
2001
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
1679
2002
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1680
2003
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1681
2004
 
1682
2005
  __builtin_assume(i_offset >= 0);
1683
- __builtin_assume(i_offset < 8);
2006
+ __builtin_assume(i_offset < nwarps);
1684
2007
  __builtin_assume(k >= 0);
1685
2008
  __builtin_assume(k < WARP_SIZE);
1686
2009
 
@@ -1690,7 +2013,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
1690
2013
  const block_q4_1 * bx0 = (block_q4_1 *) vx;
1691
2014
 
1692
2015
  #pragma unroll
1693
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2016
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
1694
2017
  int i = i0 + i_offset;
1695
2018
 
1696
2019
  if (need_check) {
@@ -1706,7 +2029,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
1706
2029
  const int kbxd = k % blocks_per_tile_x_row;
1707
2030
 
1708
2031
  #pragma unroll
1709
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) {
2032
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
1710
2033
  int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
1711
2034
 
1712
2035
  if (need_check) {
@@ -1723,26 +2046,19 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
1723
2046
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1724
2047
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1725
2048
 
1726
- __builtin_assume(i >= 0);
1727
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
1728
- __builtin_assume(j >= 0);
1729
- __builtin_assume(j < WARP_SIZE);
1730
- __builtin_assume(k >= 0);
1731
- __builtin_assume(k < WARP_SIZE);
1732
-
1733
2049
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1734
2050
 
1735
2051
  int u[2*VDR_Q4_1_Q8_1_MMQ];
1736
2052
 
1737
2053
  #pragma unroll
1738
2054
  for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
1739
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1740
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1];
2055
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
2056
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
1741
2057
  }
1742
2058
 
1743
2059
  return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
1744
2060
  (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
1745
- y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
2061
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
1746
2062
  }
1747
2063
 
1748
2064
  static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
@@ -1765,21 +2081,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
1765
2081
  return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
1766
2082
  }
1767
2083
 
1768
- static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2084
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1769
2085
 
1770
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
1771
- __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0];
2086
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2087
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
1772
2088
 
1773
2089
  *x_ql = tile_x_ql;
1774
2090
  *x_dm = (half2 *) tile_x_d;
1775
2091
  }
1776
2092
 
1777
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
2093
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
1778
2094
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1779
2095
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1780
2096
 
1781
2097
  __builtin_assume(i_offset >= 0);
1782
- __builtin_assume(i_offset < 8);
2098
+ __builtin_assume(i_offset < nwarps);
1783
2099
  __builtin_assume(k >= 0);
1784
2100
  __builtin_assume(k < WARP_SIZE);
1785
2101
 
@@ -1789,7 +2105,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
1789
2105
  const block_q5_0 * bx0 = (block_q5_0 *) vx;
1790
2106
 
1791
2107
  #pragma unroll
1792
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2108
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
1793
2109
  int i = i0 + i_offset;
1794
2110
 
1795
2111
  if (need_check) {
@@ -1825,7 +2141,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
1825
2141
  float * x_dmf = (float *) x_dm;
1826
2142
 
1827
2143
  #pragma unroll
1828
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) {
2144
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
1829
2145
  int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
1830
2146
 
1831
2147
  if (need_check) {
@@ -1842,27 +2158,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
1842
2158
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1843
2159
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1844
2160
 
1845
- __builtin_assume(i >= 0);
1846
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
1847
- __builtin_assume(j >= 0);
1848
- __builtin_assume(j < WARP_SIZE);
1849
- __builtin_assume(k >= 0);
1850
- __builtin_assume(k < WARP_SIZE);
1851
-
1852
2161
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1853
2162
  const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
1854
- const float * x_dmf = (float *) x_dm;
2163
+ const float * x_dmf = (const float *) x_dm;
2164
+ const float * y_df = (const float *) y_ds;
1855
2165
 
1856
2166
  int u[2*VDR_Q5_0_Q8_1_MMQ];
1857
2167
 
1858
2168
  #pragma unroll
1859
2169
  for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
1860
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1861
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0];
2170
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
2171
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
1862
2172
  }
1863
2173
 
1864
2174
  return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
1865
- (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
2175
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
1866
2176
  }
1867
2177
 
1868
2178
  static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
@@ -1885,21 +2195,21 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
1885
2195
  return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
1886
2196
  }
1887
2197
 
1888
- static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2198
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1889
2199
 
1890
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
1891
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1];
2200
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2201
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
1892
2202
 
1893
2203
  *x_ql = tile_x_ql;
1894
2204
  *x_dm = tile_x_dm;
1895
2205
  }
1896
2206
 
1897
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
2207
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
1898
2208
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1899
2209
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1900
2210
 
1901
2211
  __builtin_assume(i_offset >= 0);
1902
- __builtin_assume(i_offset < 8);
2212
+ __builtin_assume(i_offset < nwarps);
1903
2213
  __builtin_assume(k >= 0);
1904
2214
  __builtin_assume(k < WARP_SIZE);
1905
2215
 
@@ -1909,7 +2219,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
1909
2219
  const block_q5_1 * bx0 = (block_q5_1 *) vx;
1910
2220
 
1911
2221
  #pragma unroll
1912
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2222
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
1913
2223
  int i = i0 + i_offset;
1914
2224
 
1915
2225
  if (need_check) {
@@ -1942,7 +2252,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
1942
2252
  const int kbxd = k % blocks_per_tile_x_row;
1943
2253
 
1944
2254
  #pragma unroll
1945
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) {
2255
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
1946
2256
  int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
1947
2257
 
1948
2258
  if (need_check) {
@@ -1959,13 +2269,6 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
1959
2269
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1960
2270
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1961
2271
 
1962
- __builtin_assume(i >= 0);
1963
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
1964
- __builtin_assume(j >= 0);
1965
- __builtin_assume(j < WARP_SIZE);
1966
- __builtin_assume(k >= 0);
1967
- __builtin_assume(k < WARP_SIZE);
1968
-
1969
2272
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1970
2273
  const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
1971
2274
 
@@ -1973,12 +2276,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
1973
2276
 
1974
2277
  #pragma unroll
1975
2278
  for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
1976
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1977
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1];
2279
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
2280
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
1978
2281
  }
1979
2282
 
1980
2283
  return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
1981
- (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
2284
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
1982
2285
  }
1983
2286
 
1984
2287
  static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
@@ -1989,29 +2292,30 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
1989
2292
  int v[VDR_Q8_0_Q8_1_MMVQ];
1990
2293
  int u[VDR_Q8_0_Q8_1_MMVQ];
1991
2294
 
2295
+ #pragma unroll
1992
2296
  for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
1993
2297
  v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
1994
2298
  u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1995
2299
  }
1996
2300
 
1997
- return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds);
2301
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
1998
2302
  }
1999
2303
 
2000
- static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2304
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2001
2305
 
2002
- __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2003
- __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0];
2306
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
2307
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
2004
2308
 
2005
2309
  *x_ql = tile_x_qs;
2006
2310
  *x_dm = (half2 *) tile_x_d;
2007
2311
  }
2008
2312
 
2009
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
2313
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
2010
2314
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2011
2315
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2012
2316
 
2013
2317
  __builtin_assume(i_offset >= 0);
2014
- __builtin_assume(i_offset < 8);
2318
+ __builtin_assume(i_offset < nwarps);
2015
2319
  __builtin_assume(k >= 0);
2016
2320
  __builtin_assume(k < WARP_SIZE);
2017
2321
 
@@ -2022,7 +2326,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
2022
2326
  const block_q8_0 * bx0 = (block_q8_0 *) vx;
2023
2327
 
2024
2328
  #pragma unroll
2025
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2329
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2026
2330
  int i = i0 + i_offset;
2027
2331
 
2028
2332
  if (need_check) {
@@ -2032,76 +2336,35 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
2032
2336
  const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
2033
2337
 
2034
2338
  x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
2035
- x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
2036
2339
  }
2037
2340
 
2038
- // const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
2039
- // const int kbxd = k % blocks_per_tile_x_row;
2341
+ const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
2342
+ const int kbxd = k % blocks_per_tile_x_row;
2040
2343
 
2041
- // #pragma unroll
2042
- // for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) {
2043
- // FIXME out-of-bounds
2044
- // const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
2344
+ #pragma unroll
2345
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
2346
+ int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
2045
2347
 
2046
- // #if GGML_CUDA_MMQ_Y < 64
2047
- // if (i >= GGML_CUDA_MMQ_Y) {
2048
- // return;
2049
- // }
2050
- // #endif // GGML_CUDA_MMQ_Y < 64
2348
+ if (need_check) {
2349
+ i = min(i, i_max);
2350
+ }
2051
2351
 
2052
- // const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
2352
+ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
2053
2353
 
2054
- // x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d;
2055
- // }
2354
+ x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
2355
+ }
2056
2356
  }
2057
2357
 
2058
2358
  static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
2059
2359
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2060
2360
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2061
2361
 
2062
- __builtin_assume(i >= 0);
2063
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2064
- __builtin_assume(j >= 0);
2065
- __builtin_assume(j < WARP_SIZE);
2066
- __builtin_assume(k >= 0);
2067
- __builtin_assume(k < WARP_SIZE);
2068
-
2069
- const float * x_dmf = (float *) x_dm;
2362
+ const float * x_dmf = (const float *) x_dm;
2363
+ const float * y_df = (const float *) y_ds;
2070
2364
 
2071
2365
  return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
2072
2366
  (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
2073
- y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
2074
- }
2075
-
2076
- #define VDR_q2_K_q8_1 1
2077
-
2078
- static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl(
2079
- const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
2080
- const half2 & dm, const float * __restrict__ d8) {
2081
-
2082
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2083
- float sumf_d = 0.0f;
2084
- float sumf_m = 0.0f;
2085
-
2086
- for (int i = 0; i < QR2_K; ++i) {
2087
- const int sc = scales[2*i];
2088
-
2089
- const int vi = (v >> (2*i)) & 0x03030303;
2090
-
2091
- sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
2092
-
2093
- int sc_high = sc >> 4;
2094
- sc_high |= sc_high << 8;
2095
- sc_high |= sc_high << 16;
2096
- sumf_m += d8[i] * __dp4a(sc_high, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
2097
- }
2098
-
2099
- const float2 dmf = __half22float2(dm);
2100
-
2101
- return dmf.x*sumf_d - dmf.y*sumf_m;
2102
- #else
2103
- return 0.0f; // only to satisfy the compiler
2104
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2367
+ y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
2105
2368
  }
2106
2369
 
2107
2370
  static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
@@ -2115,34 +2378,35 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
2115
2378
  const uint8_t * scales = bq2_K->scales + scale_offset;
2116
2379
 
2117
2380
  const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
2118
- int u[QR2_K];
2381
+ int u[QR2_K];
2119
2382
  float d8[QR2_K];
2120
2383
 
2384
+ #pragma unroll
2121
2385
  for (int i = 0; i < QR2_K; ++ i) {
2122
2386
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
2123
2387
  d8[i] = bq8_1[bq8_offset + i].ds.x;
2124
2388
  }
2125
2389
 
2126
- return vec_dot_q2_K_q8_1_impl(v, u, scales, bq2_K->dm, d8);
2390
+ return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
2127
2391
  }
2128
2392
 
2129
- static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2393
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2130
2394
 
2131
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2132
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K];
2133
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2395
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2396
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
2397
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
2134
2398
 
2135
2399
  *x_ql = tile_x_ql;
2136
2400
  *x_dm = tile_x_dm;
2137
2401
  *x_sc = tile_x_sc;
2138
2402
  }
2139
2403
 
2140
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
2404
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
2141
2405
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2142
2406
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2143
2407
 
2144
2408
  __builtin_assume(i_offset >= 0);
2145
- __builtin_assume(i_offset < 8);
2409
+ __builtin_assume(i_offset < nwarps);
2146
2410
  __builtin_assume(k >= 0);
2147
2411
  __builtin_assume(k < WARP_SIZE);
2148
2412
 
@@ -2152,7 +2416,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
2152
2416
  const block_q2_K * bx0 = (block_q2_K *) vx;
2153
2417
 
2154
2418
  #pragma unroll
2155
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2419
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2156
2420
  int i = i0 + i_offset;
2157
2421
 
2158
2422
  if (need_check) {
@@ -2168,8 +2432,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
2168
2432
  const int kbxd = k % blocks_per_tile_x_row;
2169
2433
 
2170
2434
  #pragma unroll
2171
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) {
2172
- int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2435
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
2436
+ int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
2173
2437
 
2174
2438
  if (need_check) {
2175
2439
  i = min(i, i_max);
@@ -2181,7 +2445,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
2181
2445
  }
2182
2446
 
2183
2447
  #pragma unroll
2184
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2448
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
2185
2449
  int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2186
2450
 
2187
2451
  if (need_check) {
@@ -2198,68 +2462,24 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
2198
2462
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2199
2463
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2200
2464
 
2201
- __builtin_assume(i >= 0);
2202
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2203
- __builtin_assume(j >= 0);
2204
- __builtin_assume(j < WARP_SIZE);
2205
- __builtin_assume(k >= 0);
2206
- __builtin_assume(k < WARP_SIZE);
2207
-
2208
- const int kbx = k / QI2_K;
2209
- const int kqsx = k % QI2_K;
2465
+ const int kbx = k / QI2_K;
2466
+ const int ky = (k % QI2_K) * QR2_K;
2467
+ const float * y_df = (const float *) y_ds;
2210
2468
 
2211
- const int bq8_offset = QR2_K * (kqsx / QI8_1);
2212
- const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
2469
+ int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
2213
2470
 
2214
- const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16 + scale_offset;
2471
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
2472
+ const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
2215
2473
 
2216
- int u[QR2_K];
2217
- float d8[QR2_K];
2218
-
2219
- for (int l = 0; l < QR2_K; ++ l) {
2220
- const int y_qs_index = j * (QR2_K*WARP_SIZE) + kbx * (QR2_K*QI2_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
2221
- u[l] = y_qs[y_qs_index];
2222
- d8[l] = y_ds[y_qs_index / QI8_1].x;
2474
+ #pragma unroll
2475
+ for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
2476
+ v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
2223
2477
  }
2224
2478
 
2225
- return vec_dot_q2_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], u, scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], d8);
2226
- }
2227
-
2228
- #define VDR_q3_K_q8_1 1
2229
-
2230
- static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl(
2231
- const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
2232
- const int & scale_offset, const float & d, const float * __restrict__ d8) {
2233
-
2234
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2235
- float sumf = 0.0f;
2236
-
2237
- for (int i = 0; i < QR3_K; ++i) {
2238
- const int isc = scale_offset + 2*i;
2479
+ const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
2239
2480
 
2240
- const int isc_low = isc % (QK_K/32);
2241
- const int sc_shift_low = 4 * (isc / (QK_K/32));
2242
- const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
2243
-
2244
- const int isc_high = isc % (QK_K/64);
2245
- const int sc_shift_high = 2 * (isc / (QK_K/64));
2246
- const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
2247
-
2248
- const int sc = (sc_low | sc_high) - 32;
2249
-
2250
- const int vil = (vl >> (2*i)) & 0x03030303;
2251
-
2252
- const int vih = ((vh >> i) << 2) & 0x04040404;
2253
-
2254
- const int vi = __vsubss4(vil, vih);
2255
-
2256
- sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
2257
- }
2258
-
2259
- return d*sumf;
2260
- #else
2261
- return 0.0f; // only to satisfy the compiler
2262
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2481
+ const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
2482
+ return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
2263
2483
  }
2264
2484
 
2265
2485
  static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
@@ -2277,23 +2497,24 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
2277
2497
  // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2278
2498
  const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
2279
2499
 
2280
- int u[QR3_K];
2500
+ int u[QR3_K];
2281
2501
  float d8[QR3_K];
2282
2502
 
2503
+ #pragma unroll
2283
2504
  for (int i = 0; i < QR3_K; ++i) {
2284
2505
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
2285
2506
  d8[i] = bq8_1[bq8_offset + i].ds.x;
2286
2507
  }
2287
2508
 
2288
- return vec_dot_q3_K_q8_1_impl(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
2509
+ return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
2289
2510
  }
2290
2511
 
2291
- static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2512
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2292
2513
 
2293
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2294
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K];
2295
- __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
2296
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2514
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2515
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
2516
+ __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
2517
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
2297
2518
 
2298
2519
  *x_ql = tile_x_ql;
2299
2520
  *x_dm = tile_x_dm;
@@ -2301,12 +2522,12 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 **
2301
2522
  *x_sc = tile_x_sc;
2302
2523
  }
2303
2524
 
2304
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
2525
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
2305
2526
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2306
2527
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2307
2528
 
2308
2529
  __builtin_assume(i_offset >= 0);
2309
- __builtin_assume(i_offset < 8);
2530
+ __builtin_assume(i_offset < nwarps);
2310
2531
  __builtin_assume(k >= 0);
2311
2532
  __builtin_assume(k < WARP_SIZE);
2312
2533
 
@@ -2316,7 +2537,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2316
2537
  const block_q3_K * bx0 = (block_q3_K *) vx;
2317
2538
 
2318
2539
  #pragma unroll
2319
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2540
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2320
2541
  int i = i0 + i_offset;
2321
2542
 
2322
2543
  if (need_check) {
@@ -2330,10 +2551,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2330
2551
 
2331
2552
  const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
2332
2553
  const int kbxd = k % blocks_per_tile_x_row;
2554
+ float * x_dmf = (float *) x_dm;
2333
2555
 
2334
2556
  #pragma unroll
2335
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) {
2336
- int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2557
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
2558
+ int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
2337
2559
 
2338
2560
  if (need_check) {
2339
2561
  i = min(i, i_max);
@@ -2341,11 +2563,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2341
2563
 
2342
2564
  const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
2343
2565
 
2344
- x_dm[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd].x = bxi->d;
2566
+ x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
2345
2567
  }
2346
2568
 
2347
2569
  #pragma unroll
2348
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
2570
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
2349
2571
  int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
2350
2572
 
2351
2573
  if (need_check) {
@@ -2354,11 +2576,12 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2354
2576
 
2355
2577
  const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
2356
2578
 
2357
- x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
2579
+ // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2580
+ x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
2358
2581
  }
2359
2582
 
2360
2583
  #pragma unroll
2361
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2584
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
2362
2585
  int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2363
2586
 
2364
2587
  if (need_check) {
@@ -2367,7 +2590,19 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2367
2590
 
2368
2591
  const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
2369
2592
 
2370
- x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->scales, k % (QI3_K/4));
2593
+ const int ksc = k % (QI3_K/4);
2594
+
2595
+ const int ksc_low = ksc % (QI3_K/8);
2596
+ const int shift_low = 4 * (ksc / (QI3_K/8));
2597
+ const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
2598
+
2599
+ const int ksc_high = QI3_K/8;
2600
+ const int shift_high = 2 * ksc;
2601
+ const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
2602
+
2603
+ const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
2604
+
2605
+ x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
2371
2606
  }
2372
2607
  }
2373
2608
 
@@ -2375,63 +2610,29 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
2375
2610
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2376
2611
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2377
2612
 
2378
- __builtin_assume(i >= 0);
2379
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2380
- __builtin_assume(j >= 0);
2381
- __builtin_assume(j < WARP_SIZE);
2382
- __builtin_assume(k >= 0);
2383
- __builtin_assume(k < WARP_SIZE);
2384
-
2385
2613
  const int kbx = k / QI3_K;
2386
- const int kqsx = k % QI3_K;
2387
-
2388
- const int bq8_offset = QR3_K * (kqsx / (QI3_K/2));
2389
- const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
2390
-
2391
- const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16;
2614
+ const int ky = (k % QI3_K) * QR3_K;
2615
+ const float * x_dmf = (const float *) x_dm;
2616
+ const float * y_df = (const float *) y_ds;
2392
2617
 
2393
- // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2394
- const int vh = ~x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset;
2618
+ const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
2395
2619
 
2396
- int u[QR3_K];
2397
- float d8[QR3_K];
2398
-
2399
- for (int l = 0; l < QR3_K; ++ l) {
2400
- const int y_qs_index = j * (QR3_K*WARP_SIZE) + kbx * (QR3_K*QI3_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
2401
- u[l] = y_qs[y_qs_index];
2402
- d8[l] = y_ds[y_qs_index / QI8_1].x;
2403
- }
2404
-
2405
- return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset,
2406
- x_dm[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx].x, d8);
2407
- }
2408
-
2409
- #define VDR_q4_K_q8_1 2
2410
-
2411
- static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl(
2412
- const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
2413
- const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
2414
-
2415
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2416
- float sumf_d = 0.0f;
2417
- float sumf_m = 0.0f;
2620
+ int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
2418
2621
 
2419
- for (int i = 0; i < QR4_K; ++i) {
2420
- const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
2421
- const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
2622
+ #pragma unroll
2623
+ for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
2624
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
2625
+ const int shift = 2 * ((ky % 32) / 8);
2626
+ const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
2422
2627
 
2423
- const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
2424
- const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
2628
+ const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
2629
+ const int vlh = (vh << 2) & 0x04040404;
2425
2630
 
2426
- sumf_d += d8[i] * (dot1 * sc[i]);
2427
- sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
2631
+ v[l] = __vsubss4(vll, vlh);
2428
2632
  }
2429
2633
 
2430
- return __half2float(dm4.x)*sumf_d - __half2float(dm4.y)*sumf_m;
2431
-
2432
- #else
2433
- return 0.0f; // only to satisfy the compiler
2434
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2634
+ const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
2635
+ return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
2435
2636
  }
2436
2637
 
2437
2638
  static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
@@ -2478,7 +2679,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
2478
2679
  u[2*i+1] = q8[4];
2479
2680
  }
2480
2681
 
2481
- return vec_dot_q4_K_q8_1_impl(v, u, sc, m, bq4_K->dm, d8);
2682
+ return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
2482
2683
 
2483
2684
  #else
2484
2685
 
@@ -2527,23 +2728,23 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
2527
2728
  #endif
2528
2729
  }
2529
2730
 
2530
- static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2731
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2531
2732
 
2532
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2533
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K];
2534
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
2733
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2734
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
2735
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
2535
2736
 
2536
2737
  *x_ql = tile_x_ql;
2537
2738
  *x_dm = tile_x_dm;
2538
2739
  *x_sc = tile_x_sc;
2539
2740
  }
2540
2741
 
2541
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
2742
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
2542
2743
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2543
2744
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2544
2745
 
2545
2746
  __builtin_assume(i_offset >= 0);
2546
- __builtin_assume(i_offset < 8);
2747
+ __builtin_assume(i_offset < nwarps);
2547
2748
  __builtin_assume(k >= 0);
2548
2749
  __builtin_assume(k < WARP_SIZE);
2549
2750
 
@@ -2553,7 +2754,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
2553
2754
  const block_q4_K * bx0 = (block_q4_K *) vx;
2554
2755
 
2555
2756
  #pragma unroll
2556
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2757
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2557
2758
  int i = i0 + i_offset;
2558
2759
 
2559
2760
  if (need_check) {
@@ -2563,118 +2764,62 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
2563
2764
  const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
2564
2765
 
2565
2766
  x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2566
- }
2567
-
2568
- const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
2569
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2570
-
2571
- #pragma unroll
2572
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) {
2573
- int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2574
-
2575
- if (need_check) {
2576
- i = min(i, i_max);
2577
- }
2578
-
2579
- const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
2580
-
2581
- x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
2582
- }
2583
-
2584
- #pragma unroll
2585
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
2586
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
2587
-
2588
- if (need_check) {
2589
- i = min(i, i_max);
2590
- }
2591
-
2592
- const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
2593
-
2594
- x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI4_K/8));
2595
- }
2596
- }
2597
-
2598
- static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
2599
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2600
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2601
-
2602
- __builtin_assume(i >= 0);
2603
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2604
- __builtin_assume(j >= 0);
2605
- __builtin_assume(j < WARP_SIZE);
2606
- __builtin_assume(k >= 0);
2607
- __builtin_assume(k < WARP_SIZE);
2608
-
2609
- const int kbx = k / QI6_K; // == 0 if QK_K == 256
2610
- const int kqsx = k % QI6_K; // == k if QK_K == 256
2611
-
2612
- int v[2];
2613
- int u[2*QR4_K];
2614
- float d8[QR4_K];
2767
+ }
2615
2768
 
2616
- // kqsx is in 0,2...30. bq8_offset = 2 * (kqsx/4) -> bq8_offset = 0, 2, 4, 6
2617
- const int bq8_offset = QR4_K * ((kqsx/2) / (QI8_1/2));
2769
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
2770
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2618
2771
 
2619
- v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
2620
- v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
2772
+ #pragma unroll
2773
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
2774
+ int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
2621
2775
 
2622
- const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
2623
- uint16_t aux[2];
2624
- const int l = bq8_offset/2;
2625
- if (l < 2) {
2626
- aux[0] = scales[l+0] & 0x3f3f;
2627
- aux[1] = scales[l+2] & 0x3f3f;
2628
- } else {
2629
- aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
2630
- aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
2631
- }
2632
- const uint8_t * sc = (const uint8_t *)aux;
2633
- const uint8_t * m = sc + 2;
2776
+ if (need_check) {
2777
+ i = min(i, i_max);
2778
+ }
2779
+
2780
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
2634
2781
 
2635
- for (int l = 0; l < QR4_K; ++l) {
2636
- const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
2637
- u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
2638
- u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
2639
- d8[l] = y_ds[kqsy / QI8_1].x;
2782
+ x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
2640
2783
  }
2641
2784
 
2642
- return vec_dot_q4_K_q8_1_impl(v, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K + kbx], d8);
2643
- }
2785
+ #pragma unroll
2786
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
2787
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
2644
2788
 
2645
- #define VDR_q5_K_q8_1 2
2789
+ if (need_check) {
2790
+ i = min(i, i_max);
2791
+ }
2646
2792
 
2647
- static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
2648
- const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
2649
- const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
2793
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
2650
2794
 
2651
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2652
- float sumf_d = 0.0f;
2653
- float sumf_m = 0.0f;
2795
+ const int * scales = (int *) bxi->scales;
2654
2796
 
2655
- for (int i = 0; i < QR5_K; ++i) {
2656
- const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
2657
- const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
2797
+ const int ksc = k % (WARP_SIZE/8);
2658
2798
 
2659
- const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
2660
- const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
2799
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
2800
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
2801
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
2661
2802
 
2662
- const int v0i = vl0i | vh0i;
2663
- const int v1i = vl1i | vh1i;
2803
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
2804
+ }
2805
+ }
2664
2806
 
2665
- const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
2666
- const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
2807
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
2808
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2809
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2667
2810
 
2668
- sumf_d += d8[i] * (dot1 * sc[i]);
2669
- sumf_m += d8[i] * (dot2 * m[i]);
2811
+ int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
2670
2812
 
2813
+ #pragma unroll
2814
+ for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
2815
+ v[l + 0] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
2816
+ v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
2671
2817
  }
2672
2818
 
2673
- return __half2float(dm5.x)*sumf_d - __half2float(dm5.y)*sumf_m;
2819
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
2674
2820
 
2675
- #else
2676
- return 0.0f; // only to satisfy the compiler
2677
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2821
+ const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
2822
+ return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
2678
2823
  }
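(Aside, not part of the diff: block_q4_K packs eight 6-bit scales and eight 6-bit mins into its 12-byte scales array. The repacking in the hunk above spreads them into x_sc so that every value ends up as a plain byte, which is why the new vec_dot can address the mins as sc + 8. A minimal sketch, mirroring names from the diff:)

// sketch only — after repacking, scales and mins are consecutive bytes
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
const uint8_t * m  = sc + 8; // the corresponding mins sit 8 bytes past the scales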
2679
2824
 
2680
2825
  static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
@@ -2711,6 +2856,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
2711
2856
  const uint8_t * sc = (const uint8_t *)aux;
2712
2857
  const uint8_t * m = sc + 2;
2713
2858
 
2859
+ #pragma unroll
2714
2860
  for (int i = 0; i < QR5_K; ++i) {
2715
2861
  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
2716
2862
  d8[i] = bq8i->ds.x;
@@ -2765,25 +2911,23 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
2765
2911
  #endif
2766
2912
  }
2767
2913
 
2768
- static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2914
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2769
2915
 
2770
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2771
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K];
2772
- __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2773
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
2916
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2917
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
2918
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
2774
2919
 
2775
2920
  *x_ql = tile_x_ql;
2776
2921
  *x_dm = tile_x_dm;
2777
- *x_qh = tile_x_qh;
2778
2922
  *x_sc = tile_x_sc;
2779
2923
  }
2780
2924
 
2781
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
2925
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
2782
2926
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2783
2927
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2784
2928
 
2785
2929
  __builtin_assume(i_offset >= 0);
2786
- __builtin_assume(i_offset < 8);
2930
+ __builtin_assume(i_offset < nwarps);
2787
2931
  __builtin_assume(k >= 0);
2788
2932
  __builtin_assume(k < WARP_SIZE);
2789
2933
 
@@ -2793,7 +2937,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
2793
2937
  const block_q5_K * bx0 = (block_q5_K *) vx;
2794
2938
 
2795
2939
  #pragma unroll
2796
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2940
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2797
2941
  int i = i0 + i_offset;
2798
2942
 
2799
2943
  if (need_check) {
@@ -2801,16 +2945,29 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
2801
2945
  }
2802
2946
 
2803
2947
  const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
2948
+ const int ky = QR5_K*kqsx;
2804
2949
 
2805
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2950
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
2951
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
2952
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
2953
+
2954
+ const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
2955
+ const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
2956
+ const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
2957
+
2958
+ const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
2959
+ const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
2960
+
2961
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
2962
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
2806
2963
  }
2807
2964
 
2808
2965
  const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
2809
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2966
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2810
2967
 
2811
2968
  #pragma unroll
2812
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) {
2813
- int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2969
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
2970
+ int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
2814
2971
 
2815
2972
  if (need_check) {
2816
2973
  i = min(i, i_max);
@@ -2822,29 +2979,24 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
2822
2979
  }
2823
2980
 
2824
2981
  #pragma unroll
2825
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2826
- int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2982
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
2983
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
2827
2984
 
2828
2985
  if (need_check) {
2829
2986
  i = min(i, i_max);
2830
2987
  }
2831
2988
 
2832
- const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI5_K/4);
2833
-
2834
- x_qh[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->qh, k % (QI5_K/4));
2835
- }
2989
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
2836
2990
 
2837
- #pragma unroll
2838
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
2839
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
2991
+ const int * scales = (int *) bxi->scales;
2840
2992
 
2841
- if (need_check) {
2842
- i = min(i, i_max);
2843
- }
2993
+ const int ksc = k % (WARP_SIZE/8);
2844
2994
 
2845
- const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
2995
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
2996
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
2997
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
2846
2998
 
2847
- x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI5_K/8));
2999
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
2848
3000
  }
2849
3001
  }
2850
3002
 
@@ -2852,77 +3004,11 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
2852
3004
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2853
3005
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2854
3006
 
2855
- __builtin_assume(i >= 0);
2856
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2857
- __builtin_assume(j >= 0);
2858
- __builtin_assume(j < WARP_SIZE);
2859
- __builtin_assume(k >= 0);
2860
- __builtin_assume(k < WARP_SIZE);
2861
-
2862
- const int kbx = k / QI6_K; // == 0 if QK_K == 256
2863
- const int kqsx = k % QI6_K; // == k if QK_K == 256
2864
-
2865
- int vl[2];
2866
- int vh[2];
2867
- int u[2*QR4_K];
2868
- float d8[QR4_K];
2869
-
2870
- const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2));
2871
-
2872
- vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
2873
- vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
2874
-
2875
- vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset;
2876
- vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset;
2877
-
2878
- const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
2879
- uint16_t aux[2];
2880
- const int l = bq8_offset/2;
2881
- if (l < 2) {
2882
- aux[0] = scales[l+0] & 0x3f3f;
2883
- aux[1] = scales[l+2] & 0x3f3f;
2884
- } else {
2885
- aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
2886
- aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
2887
- }
2888
- const uint8_t * sc = (const uint8_t *)aux;
2889
- const uint8_t * m = sc + 2;
2890
-
2891
- for (int l = 0; l < QR5_K; ++l) {
2892
- const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
2893
- u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
2894
- u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
2895
- d8[l] = y_ds[kqsy / QI8_1].x;
2896
- }
2897
-
2898
- return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K + kbx], d8);
2899
- }
2900
-
2901
- #define VDR_q6_K_q8_1 1
2902
-
2903
- static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl(
2904
- const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
2905
- const float & d, const float * __restrict__ d8) {
2906
-
2907
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2908
- float sumf = 0.0f;
2909
-
2910
- for (int i = 0; i < QR6_K; ++i) {
2911
- const int sc = scales[4*i];
2912
-
2913
- const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
2914
-
2915
- const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
2916
-
2917
- const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
2918
-
2919
- sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
2920
- }
3007
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
2921
3008
 
2922
- return d*sumf;
2923
- #else
2924
- return 0.0f; // only to satisfy the compiler
2925
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
3009
+ const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
3010
+ const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
3011
+ return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
2926
3012
  }
2927
3013
 
2928
3014
  static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -2942,33 +3028,32 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
2942
3028
  int u[QR6_K];
2943
3029
  float d8[QR6_K];
2944
3030
 
3031
+ #pragma unroll
2945
3032
  for (int i = 0; i < QR6_K; ++i) {
2946
3033
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
2947
3034
  d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
2948
3035
  }
2949
3036
 
2950
- return vec_dot_q6_K_q8_1_impl(vl, vh, u, scales, bq6_K->d, d8);
3037
+ return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
2951
3038
  }
2952
3039
 
2953
- static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
3040
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2954
3041
 
2955
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2956
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K];
2957
- __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
2958
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
3042
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
3043
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
3044
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
2959
3045
 
2960
3046
  *x_ql = tile_x_ql;
2961
3047
  *x_dm = tile_x_dm;
2962
- *x_qh = tile_x_qh;
2963
3048
  *x_sc = tile_x_sc;
2964
3049
  }
2965
3050
 
2966
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
3051
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
2967
3052
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2968
3053
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2969
3054
 
2970
3055
  __builtin_assume(i_offset >= 0);
2971
- __builtin_assume(i_offset < 8);
3056
+ __builtin_assume(i_offset < nwarps);
2972
3057
  __builtin_assume(k >= 0);
2973
3058
  __builtin_assume(k < WARP_SIZE);
2974
3059
 
@@ -2978,7 +3063,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
2978
3063
  const block_q6_K * bx0 = (block_q6_K *) vx;
2979
3064
 
2980
3065
  #pragma unroll
2981
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
3066
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2982
3067
  int i = i0 + i_offset;
2983
3068
 
2984
3069
  if (need_check) {
@@ -2986,42 +3071,43 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
2986
3071
  }
2987
3072
 
2988
3073
  const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
3074
+ const int ky = QR6_K*kqsx;
2989
3075
 
2990
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->ql, kqsx);
2991
- }
2992
-
2993
- const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
2994
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2995
-
2996
- #pragma unroll
2997
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
2998
- int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
3076
+ const int ql = get_int_from_uint8(bxi->ql, kqsx);
3077
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
3078
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
2999
3079
 
3000
- if (need_check) {
3001
- i = min(i, i_max);
3002
- }
3080
+ const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
3081
+ const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
3082
+ const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
3003
3083
 
3004
- const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
3084
+ const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
3085
+ const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
3005
3086
 
3006
- x_dm[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd].x = bxi->d;
3087
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
3088
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
3007
3089
  }
3008
3090
 
3091
+ const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
3092
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
3093
+ float * x_dmf = (float *) x_dm;
3094
+
3009
3095
  #pragma unroll
3010
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
3011
- int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
3096
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
3097
+ int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
3012
3098
 
3013
3099
  if (need_check) {
3014
3100
  i = min(i, i_max);
3015
3101
  }
3016
3102
 
3017
- const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI6_K/2);
3103
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
3018
3104
 
3019
- x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->qh, k % (QI6_K/2));
3105
+ x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
3020
3106
  }
3021
3107
 
3022
3108
  #pragma unroll
3023
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
3024
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
3109
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
3110
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
3025
3111
 
3026
3112
  if (need_check) {
3027
3113
  i = min(i, i_max);
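(Aside, not part of the diff: __vsubss4 performs a byte-wise signed subtraction with saturation, which is what lets the q6_K loader above apply the constant offset of 32 to all four packed quants in a single instruction. A minimal sketch:)

// illustrative only — each byte of 0x20202020 is 32, so this equals (ql0|qh0) - 32 in every byte,
// with signed saturation instead of wrap-around
const int q = __vsubss4(ql0 | qh0, 0x20202020);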
@@ -3037,39 +3123,17 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
3037
3123
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3038
3124
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
3039
3125
 
3040
- __builtin_assume(i >= 0);
3041
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
3042
- __builtin_assume(j >= 0);
3043
- __builtin_assume(j < WARP_SIZE);
3044
- __builtin_assume(k >= 0);
3045
- __builtin_assume(k < WARP_SIZE);
3046
-
3047
- const int kbx = k / QI6_K; // == 0 if QK_K == 256
3048
- const int kqsx = k % QI6_K; // == k if QK_K == 256
3049
-
3050
- const int bq8_offset = 2 * QR6_K * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/4);
3051
- const int scale_offset = (QI6_K/4) * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/8);
3052
- const int vh_shift = 2 * ((kqsx % (QI6_K/2)) / (QI6_K/4));
3053
-
3054
- const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI6_K/2) + (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)] >> vh_shift;
3055
-
3056
- const int x_sc_offset = i * (WARP_SIZE/8) + i/8 + kbx * (QI6_K/8);
3057
- const int8_t * scales = ((int8_t *) (x_sc + x_sc_offset)) + scale_offset;
3126
+ const float * x_dmf = (const float *) x_dm;
3127
+ const float * y_df = (const float *) y_ds;
3058
3128
 
3059
- int u[QR6_K];
3060
- float d8[QR6_K];
3061
-
3062
- for (int l = 0; l < QR6_K; ++l) {
3063
- const int kqsy = j * (QR6_K*WARP_SIZE) + kbx * (QR6_K*QI6_K) + (bq8_offset + 2*l)*QI8_1 + kqsx % QI8_1;
3064
- u[l] = y_qs[kqsy];
3065
- d8[l] = y_ds[kqsy / QI8_1].x;
3066
- }
3129
+ const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
3067
3130
 
3068
- return vec_dot_q6_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales,
3069
- x_dm[i * (WARP_SIZE/QI6_K) + i/QI6_K + kbx].x, d8);
3131
+ const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
3132
+ const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
3133
+ return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
3070
3134
  }
3071
3135
 
3072
- template <int qk, int qr, int qi, typename block_q_t,
3136
+ template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
3073
3137
  allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
3074
3138
  static __global__ void mul_mat_q(
3075
3139
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
@@ -3084,14 +3148,11 @@ static __global__ void mul_mat_q(
3084
3148
 
3085
3149
  const int & ncols_dst = ncols_y;
3086
3150
 
3087
- const int tid_x = threadIdx.x;
3088
- const int tid_y = threadIdx.y;
3089
-
3090
- const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
3151
+ const int row_dst_0 = blockIdx.x*mmq_y;
3091
3152
  const int & row_x_0 = row_dst_0;
3092
- const int row_dst = row_dst_0 + tid_x;
3153
+ const int row_dst = row_dst_0 + threadIdx.x;
3093
3154
 
3094
- const int col_dst_0 = blockIdx.y*WARP_SIZE;
3155
+ const int col_dst_0 = blockIdx.y*mmq_x;
3095
3156
  const int & col_y_0 = col_dst_0;
3096
3157
 
3097
3158
  int * tile_x_ql = nullptr;
@@ -3101,55 +3162,65 @@ static __global__ void mul_mat_q(
3101
3162
 
3102
3163
  allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
3103
3164
 
3104
- const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1;
3105
-
3106
- __shared__ int tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
3107
- __shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];
3165
+ __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
3166
+ __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
3108
3167
 
3109
- float sum[GGML_CUDA_MMQ_Y/WARP_SIZE][4] = {0.0f};
3168
+ float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
3110
3169
 
3111
3170
  for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
3112
3171
 
3113
3172
  load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
3114
- tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x);
3173
+ threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
3115
3174
 
3175
+ #pragma unroll
3116
3176
  for (int ir = 0; ir < qr; ++ir) {
3117
- const int kqs = ir*WARP_SIZE + tid_x;
3177
+ const int kqs = ir*WARP_SIZE + threadIdx.x;
3118
3178
  const int kbxd = kqs / QI8_1;
3119
3179
 
3120
- for (int i = 0; i < WARP_SIZE; i += 8) {
3121
- const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
3180
+ #pragma unroll
3181
+ for (int i = 0; i < mmq_x; i += nwarps) {
3182
+ const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
3122
3183
 
3123
3184
  const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
3124
3185
 
3125
- tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1);
3186
+ const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
3187
+ tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
3126
3188
  }
3127
- }
3128
3189
 
3129
- for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) {
3130
- const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE;
3131
- const int kby = tid_x % blocks_per_tile_y_col;
3132
- const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
3133
- tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby] = y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds;
3134
- }
3190
+ #pragma unroll
3191
+ for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
3192
+ const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
3193
+ const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
3194
+ const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
3195
+
3196
+ // if the sum is not needed it's faster to transform the scale to f32 ahead of time
3197
+ const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
3198
+ half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
3199
+ if (need_sum) {
3200
+ *dsi_dst = *dsi_src;
3201
+ } else {
3202
+ float * dfi_dst = (float *) dsi_dst;
3203
+ *dfi_dst = (*dsi_src).x;
3204
+ }
3205
+ }
3135
3206
 
3136
- __syncthreads();
3207
+ __syncthreads();
3137
3208
 
3138
- #if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
3139
- #pragma unroll
3140
- #endif // __CUDA_ARCH__ >= 700
3141
- for (int k = 0; k < WARP_SIZE; k += vdr) {
3209
+ // #pragma unroll // unrolling this loop causes too much register pressure
3210
+ for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
3142
3211
  #pragma unroll
3143
- for (int j = 0; j < WARP_SIZE; j += 8) {
3212
+ for (int j = 0; j < mmq_x; j += nwarps) {
3144
3213
  #pragma unroll
3145
- for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
3146
- sum[i/WARP_SIZE][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
3147
- tid_x + i, tid_y + j, k);
3214
+ for (int i = 0; i < mmq_y; i += WARP_SIZE) {
3215
+ sum[i/WARP_SIZE][j/nwarps] += vec_dot(
3216
+ tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
3217
+ threadIdx.x + i, threadIdx.y + j, k);
3218
+ }
3148
3219
  }
3149
3220
  }
3150
- }
3151
3221
 
3152
- __syncthreads();
3222
+ __syncthreads();
3223
+ }
3153
3224
  }
3154
3225
 
3155
3226
 
@@ -3157,15 +3228,15 @@ static __global__ void mul_mat_q(
3157
3228
  return;
3158
3229
  }
3159
3230
 
3160
- for (int j = 0; j < WARP_SIZE; j += 8) {
3161
- const int col_dst = col_dst_0 + j + tid_y;
3231
+ for (int j = 0; j < mmq_x; j += nwarps) {
3232
+ const int col_dst = col_dst_0 + j + threadIdx.y;
3162
3233
 
3163
3234
  if (col_dst >= ncols_dst) {
3164
3235
  return;
3165
3236
  }
3166
3237
 
3167
- for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
3168
- dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/8];
3238
+ for (int i = 0; i < mmq_y; i += WARP_SIZE) {
3239
+ dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/nwarps];
3169
3240
  }
3170
3241
  }
3171
3242
  }
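(Aside, not part of the diff: after these hunks each thread block computes an mmq_y x mmq_x tile of dst using nwarps warps, and the per-thread accumulator is sized to match. A sketch of the geometry, with names taken from the diff and concrete values supplied by the host wrappers below:)

// sketch only
const dim3 block_dims(WARP_SIZE, nwarps, 1);              // nwarps warps of WARP_SIZE threads each
const dim3 block_nums((nrows_x + mmq_y - 1) / mmq_y,      // tiles of mmq_y rows of dst
                      (ncols_y + mmq_x - 1) / mmq_x, 1);  // tiles of mmq_x columns of dst
float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};        // partial results held per thread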
@@ -3780,7 +3851,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
3780
3851
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3781
3852
  const dim3 block_nums(1, block_num_y, 1);
3782
3853
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3783
- mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1>
3854
+ mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
3784
3855
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3785
3856
  }
3786
3857
 
@@ -3789,7 +3860,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
3789
3860
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3790
3861
  const dim3 block_nums(1, block_num_y, 1);
3791
3862
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3792
- mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1>
3863
+ mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
3793
3864
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3794
3865
  }
3795
3866
 
@@ -3798,7 +3869,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
3798
3869
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3799
3870
  const dim3 block_nums(1, block_num_y, 1);
3800
3871
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3801
- mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1>
3872
+ mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
3802
3873
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3803
3874
  }
3804
3875
 
@@ -3807,7 +3878,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
3807
3878
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3808
3879
  const dim3 block_nums(1, block_num_y, 1);
3809
3880
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3810
- mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1>
3881
+ mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
3811
3882
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3812
3883
  }
3813
3884
 
@@ -3816,7 +3887,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
3816
3887
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3817
3888
  const dim3 block_nums(1, block_num_y, 1);
3818
3889
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3819
- mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1>
3890
+ mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
3820
3891
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3821
3892
  }
3822
3893
 
@@ -3867,17 +3938,52 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
3867
3938
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3868
3939
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3869
3940
 
3870
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3871
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3872
- const dim3 block_nums(block_num_x, block_num_y, 1);
3873
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3874
-
3875
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3876
- mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<false>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3877
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3941
+ int id;
3942
+ CUDA_CHECK(cudaGetDevice(&id));
3943
+ const int compute_capability = g_compute_capabilities[id];
3944
+
3945
+ if (compute_capability >= CC_TURING) {
3946
+ const int mmq_x = 64;
3947
+ const int mmq_y = 128;
3948
+ const int nwarps = 4;
3949
+
3950
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
3951
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3952
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3953
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
3954
+
3955
+ if (nrows_x % mmq_y == 0) {
3956
+ const bool need_check = false;
3957
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3958
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3959
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3960
+ } else {
3961
+ const bool need_check = true;
3962
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3963
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3964
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3965
+ }
3878
3966
  } else {
3879
- mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<true>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3880
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3967
+ const int mmq_x = 64;
3968
+ const int mmq_y = 64;
3969
+ const int nwarps = 4;
3970
+
3971
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
3972
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3973
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3974
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
3975
+
3976
+ if (nrows_x % mmq_y == 0) {
3977
+ const bool need_check = false;
3978
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3979
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3980
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3981
+ } else {
3982
+ const bool need_check = true;
3983
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3984
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3985
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3986
+ }
3881
3987
  }
3882
3988
  }
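(Aside, not part of the diff: this dispatch pattern is repeated for every quantization type below. The tile shape is now chosen per device at runtime; Turing and newer GPUs get larger tiles, older ones a conservative 64x64 tile, presumably to trade shared-memory and register use against occupancy. Condensed:)

// condensed sketch of the repeated wrapper logic (concrete numbers vary by quantization type)
int id;
CUDA_CHECK(cudaGetDevice(&id));
if (g_compute_capabilities[id] >= CC_TURING) {
    // e.g. mmq_x = 64, mmq_y = 128, nwarps = 4 for q4_0
} else {
    // e.g. mmq_x = 64, mmq_y = 64, nwarps = 4 (8 for several other types)
}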
3883
3989
 
@@ -3885,17 +3991,53 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
3885
3991
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3886
3992
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3887
3993
 
3888
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3889
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3890
- const dim3 block_nums(block_num_x, block_num_y, 1);
3891
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3892
-
3893
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3894
- mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<false>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3895
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3994
+ int id;
3995
+ CUDA_CHECK(cudaGetDevice(&id));
3996
+ const int compute_capability = g_compute_capabilities[id];
3997
+
3998
+ if (compute_capability >= CC_TURING) {
3999
+ const int mmq_x = 64;
4000
+ const int mmq_y = 128;
4001
+ const int nwarps = 4;
4002
+
4003
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4004
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4005
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4006
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4007
+
4008
+ if (nrows_x % mmq_y == 0) {
4009
+ const bool need_check = false;
4010
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4011
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4012
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4013
+ } else {
4014
+ const bool need_check = true;
4015
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4016
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4017
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4018
+ }
3896
4019
  } else {
3897
- mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<true>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3898
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4020
+ const int mmq_x = 64;
4021
+ const int mmq_y = 64;
4022
+ const int nwarps = 8;
4023
+
4024
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4025
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4026
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4027
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4028
+
4029
+ if (nrows_x % mmq_y == 0) {
4030
+ const bool need_check = false;
4031
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4032
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4033
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4034
+ } else {
4035
+ const bool need_check = true;
4036
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4037
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4038
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4039
+ }
4040
+
3899
4041
  }
3900
4042
  }
3901
4043
 
@@ -3903,17 +4045,52 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
3903
4045
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3904
4046
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3905
4047
 
3906
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3907
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3908
- const dim3 block_nums(block_num_x, block_num_y, 1);
3909
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3910
-
3911
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3912
- mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<false>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3913
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4048
+ int id;
4049
+ CUDA_CHECK(cudaGetDevice(&id));
4050
+ const int compute_capability = g_compute_capabilities[id];
4051
+
4052
+ if (compute_capability >= CC_TURING) {
4053
+ const int mmq_x = 128;
4054
+ const int mmq_y = 64;
4055
+ const int nwarps = 4;
4056
+
4057
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4058
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4059
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4060
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4061
+
4062
+ if (nrows_x % mmq_y == 0) {
4063
+ const bool need_check = false;
4064
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4065
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4066
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4067
+ } else {
4068
+ const bool need_check = true;
4069
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4070
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4071
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4072
+ }
3914
4073
  } else {
3915
- mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<true>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3916
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4074
+ const int mmq_x = 64;
4075
+ const int mmq_y = 64;
4076
+ const int nwarps = 8;
4077
+
4078
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4079
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4080
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4081
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4082
+
4083
+ if (nrows_x % mmq_y == 0) {
4084
+ const bool need_check = false;
4085
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4086
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4087
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4088
+ } else {
4089
+ const bool need_check = true;
4090
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4091
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4092
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4093
+ }
3917
4094
  }
3918
4095
  }
3919
4096
 
@@ -3921,17 +4098,52 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
3921
4098
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3922
4099
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3923
4100
 
3924
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3925
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3926
- const dim3 block_nums(block_num_x, block_num_y, 1);
3927
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3928
-
3929
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3930
- mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<false>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3931
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4101
+ int id;
4102
+ CUDA_CHECK(cudaGetDevice(&id));
4103
+ const int compute_capability = g_compute_capabilities[id];
4104
+
4105
+ if (compute_capability >= CC_TURING) {
4106
+ const int mmq_x = 128;
4107
+ const int mmq_y = 64;
4108
+ const int nwarps = 8;
4109
+
4110
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4111
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4112
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4113
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4114
+
4115
+ if (nrows_x % mmq_y == 0) {
4116
+ const bool need_check = false;
4117
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4118
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4119
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4120
+ } else {
4121
+ const bool need_check = true;
4122
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4123
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4124
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4125
+ }
3932
4126
  } else {
3933
- mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<true>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3934
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4127
+ const int mmq_x = 64;
4128
+ const int mmq_y = 64;
4129
+ const int nwarps = 8;
4130
+
4131
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4132
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4133
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4134
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4135
+
4136
+ if (nrows_x % mmq_y == 0) {
4137
+ const bool need_check = false;
4138
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4139
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4140
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4141
+ } else {
4142
+ const bool need_check = true;
4143
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4144
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4145
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4146
+ }
3935
4147
  }
3936
4148
  }
3937
4149
 
@@ -3939,17 +4151,52 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
3939
4151
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3940
4152
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3941
4153
 
3942
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3943
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3944
- const dim3 block_nums(block_num_x, block_num_y, 1);
3945
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3946
-
3947
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3948
- mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<false>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3949
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4154
+ int id;
4155
+ CUDA_CHECK(cudaGetDevice(&id));
4156
+ const int compute_capability = g_compute_capabilities[id];
4157
+
4158
+ if (compute_capability >= CC_TURING) {
4159
+ const int mmq_x = 128;
4160
+ const int mmq_y = 64;
4161
+ const int nwarps = 4;
4162
+
4163
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4164
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4165
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4166
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4167
+
4168
+ if (nrows_x % mmq_y == 0) {
4169
+ const bool need_check = false;
4170
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4171
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4172
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4173
+ } else {
4174
+ const bool need_check = true;
4175
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4176
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4177
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4178
+ }
3950
4179
  } else {
3951
- mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<true>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3952
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4180
+ const int mmq_x = 64;
4181
+ const int mmq_y = 64;
4182
+ const int nwarps = 8;
4183
+
4184
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4185
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4186
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4187
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4188
+
4189
+ if (nrows_x % mmq_y == 0) {
4190
+ const bool need_check = false;
4191
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4192
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4193
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4194
+ } else {
4195
+ const bool need_check = true;
4196
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4197
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4198
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4199
+ }
3953
4200
  }
3954
4201
  }
3955
4202
 
@@ -3957,17 +4204,52 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<false>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ if (compute_capability >= CC_TURING) {
+ const int mmq_x = 64;
+ const int mmq_y = 128;
+ const int nwarps = 4;
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ } else {
+ const bool need_check = true;
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ }
  } else {
- mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<true>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const int mmq_x = 64;
+ const int mmq_y = 64;
+ const int nwarps = 8;
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ } else {
+ const bool need_check = true;
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ }
  }
  }
 
@@ -3975,17 +4257,52 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<false>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ if (compute_capability >= CC_TURING) {
+ const int mmq_x = 128;
+ const int mmq_y = 128;
+ const int nwarps = 4;
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ } else {
+ const bool need_check = true;
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ }
  } else {
- mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<true>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const int mmq_x = 64;
+ const int mmq_y = 64;
+ const int nwarps = 8;
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ } else {
+ const bool need_check = true;
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ }
  }
  }
 
@@ -3993,17 +4310,52 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<false>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ if (compute_capability >= CC_TURING) {
+ const int mmq_x = 64;
+ const int mmq_y = 128;
+ const int nwarps = 4;
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ } else {
+ const bool need_check = true;
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ }
  } else {
- mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<true>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const int mmq_x = 32;
+ const int mmq_y = 64;
+ const int nwarps = 8;
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ } else {
+ const bool need_check = true;
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ }
  }
  }
 
@@ -4011,17 +4363,52 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<false>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ if (compute_capability >= CC_TURING) {
+ const int mmq_x = 64;
+ const int mmq_y = 128;
+ const int nwarps = 4;
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ } else {
+ const bool need_check = true;
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ }
  } else {
- mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<true>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const int mmq_x = 64;
+ const int mmq_y = 64;
+ const int nwarps = 8;
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ } else {
+ const bool need_check = true;
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ }
  }
  }
 
@@ -4029,17 +4416,52 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<false>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ if (compute_capability >= CC_TURING) {
+ const int mmq_x = 64;
+ const int mmq_y = 64;
+ const int nwarps = 4;
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ } else {
+ const bool need_check = true;
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ }
  } else {
- mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<true>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const int mmq_x = 32;
+ const int mmq_y = 64;
+ const int nwarps = 8;
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ } else {
+ const bool need_check = true;
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ }
  }
  }
 
@@ -4214,20 +4636,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
  }
 
 
- static void * g_scratch_buffer = nullptr;
- static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
- static size_t g_scratch_offset = 0;
-
- static int g_device_count = -1;
- static int g_main_device = 0;
- static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
- static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
- static bool g_mul_mat_q = false;
-
- static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
-
- static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
  void ggml_init_cublas() {
  static bool initialized = false;
 
@@ -4583,6 +4991,37 @@ inline void ggml_cuda_op_mul_mat_q(
  (void) i1;
  }
 
+ static int64_t get_row_rounding(ggml_type type) {
+ int max_compute_capability = INT_MIN;
+ for (int id = 0; id < g_device_count; ++id) {
+ if (max_compute_capability < g_compute_capabilities[id]
+ && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+ max_compute_capability = g_compute_capabilities[id];
+ }
+ }
+
+ switch(type) {
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1:
+ return max_compute_capability >= CC_TURING ? 128 : 64;
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
+ return 64;
+ case GGML_TYPE_F16:
+ return 1;
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ return max_compute_capability >= CC_TURING ? 128 : 64;
+ case GGML_TYPE_Q6_K:
+ return 64;
+ default:
+ GGML_ASSERT(false);
+ }
+ }
+
  inline void ggml_cuda_op_mul_mat_vec(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
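The new get_row_rounding helper replaces the fixed GGML_CUDA_MMQ_Y granularity: it looks at the highest compute capability among devices that are actually assigned rows by the tensor split and derives a per-type row rounding from that. The following standalone sketch mirrors that logic for a hypothetical two-GPU setup; the capabilities (610 and 750), the 40/60 split, and the 700 threshold (for CC_TURING) are made-up example values.

// sketch: deriving the row rounding from capability and tensor split
#include <climits>
#include <cstdio>

int main() {
    const int device_count = 2;
    const int capabilities[2] = {610, 750};       // e.g. GTX 10xx + RTX 20xx
    const float tensor_split[2] = {0.0f, 0.4f};   // device 0 gets 40%, device 1 gets 60%

    // Only devices that actually receive rows (their split fraction grows) count.
    int max_cc = INT_MIN;
    for (int id = 0; id < device_count; ++id) {
        const float next = id + 1 < device_count ? tensor_split[id + 1] : 1.0f;
        if (max_cc < capabilities[id] && tensor_split[id] < next) {
            max_cc = capabilities[id];
        }
    }

    // For the k-quants (Q2_K..Q5_K) the rounding is 128 on Turing+, otherwise 64.
    const int rounding = max_cc >= 700 ? 128 : 64;
    std::printf("max_cc=%d -> row rounding %d\n", max_cc, rounding);
    return 0;
}

With these example values the Turing card dominates, so the split boundaries in the hunks below get snapped to multiples of 128 rather than 64.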
@@ -4983,14 +5422,16 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 
  int64_t row_low, row_high;
  if (split) {
+ const int64_t rounding = get_row_rounding(src0->type);
+
  row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
- row_low -= row_low % GGML_CUDA_MMQ_Y;
+ row_low -= row_low % rounding;
 
  if (id == g_device_count - 1) {
  row_high = nrows0;
  } else {
  row_high = nrows0*g_tensor_split[id + 1];
- row_high -= row_high % GGML_CUDA_MMQ_Y;
+ row_high -= row_high % rounding;
  }
  } else {
  row_low = 0;
@@ -5203,7 +5644,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  if (split && g_device_count > 1) {
  CUDA_CHECK(cudaSetDevice(g_main_device));
  for (int id = 0; id < g_device_count; ++id) {
- if (id != g_main_device) {
+ if (id != g_main_device && src0_extra->events[id]) {
  CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
  }
  }
@@ -5347,7 +5788,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
  } else {
  int min_compute_capability = INT_MAX;
  for (int id = 0; id < g_device_count; ++id) {
- if (min_compute_capability > g_compute_capabilities[id]) {
+ if (min_compute_capability > g_compute_capabilities[id]
+ && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
  min_compute_capability = g_compute_capabilities[id];
  }
  }
@@ -5468,14 +5910,16 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  row_low = 0;
  row_high = nrows;
  } else if (backend == GGML_BACKEND_GPU_SPLIT) {
+ const int64_t rounding = get_row_rounding(tensor->type);
+
  row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
- row_low -= row_low % GGML_CUDA_MMQ_Y;
+ row_low -= row_low % rounding;
 
  if (id == g_device_count - 1) {
  row_high = nrows;
  } else {
  row_high = nrows*g_tensor_split[id + 1];
- row_high -= row_high % GGML_CUDA_MMQ_Y;
+ row_high -= row_high % rounding;
  }
  } else {
  GGML_ASSERT(false);
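Both ggml_cuda_op and ggml_cuda_transform_tensor now snap each device's row range down to a multiple of the value returned by get_row_rounding(), so every shard starts on a tile boundary the MMQ kernels can handle without bounds checks. A rough standalone sketch of that arithmetic follows; the row count, the 40/60 split, and the rounding of 128 are hypothetical inputs chosen only to show the boundary calculation.

// sketch: computing per-device row ranges with rounded boundaries
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t nrows = 10000;
    const int device_count = 2;
    const float tensor_split[2] = {0.0f, 0.4f};
    const int64_t rounding = 128;   // e.g. what get_row_rounding() would return on Turing+

    for (int id = 0; id < device_count; ++id) {
        int64_t row_low = id == 0 ? 0 : (int64_t)(nrows * tensor_split[id]);
        row_low -= row_low % rounding;   // snap the lower bound down to a tile boundary

        int64_t row_high;
        if (id == device_count - 1) {
            row_high = nrows;            // the last device takes the remainder
        } else {
            row_high = (int64_t)(nrows * tensor_split[id + 1]);
            row_high -= row_high % rounding;
        }
        std::printf("device %d: rows [%lld, %lld)\n",
                    id, (long long)row_low, (long long)row_high);
    }
    return 0;
}

With these inputs the split point 4000 is rounded down to 3968, so device 0 covers rows [0, 3968) and device 1 covers [3968, 10000).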