llama_cpp 0.3.6 → 0.3.7

This diff shows the changes between publicly released package versions as they appear in their public registry; it is provided for informational purposes only.
@@ -14,6 +14,7 @@
14
14
  #include "ggml.h"
15
15
 
16
16
  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
17
+ #define CC_TURING 700
17
18
 
18
19
  #if defined(_MSC_VER)
19
20
  #pragma warning(disable: 4244 4267) // possible loss of data
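Note: the hunks in this diff are against what appears to be the vendored llama.cpp CUDA source (ggml-cuda.cu). The new CC_TURING constant sits next to MIN_CC_DP4A, which gates the __dp4a byte-wise dot product on compute capability 6.1+. As a minimal sketch of how such compute-capability gating typically looks (illustrative only, not the package's code):

```cuda
// Minimal sketch, assuming the usual __CUDA_ARCH__ gating pattern; not the package's code.
#define MIN_CC_DP4A 610 // __dp4a requires compute capability >= 6.1
#define CC_TURING   700 // Turing and newer

static __device__ __forceinline__ int byte_dot(const int a, const int b, const int acc) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= MIN_CC_DP4A
    return __dp4a(a, b, acc); // dot product of 4 packed int8 pairs plus accumulator, one instruction
#else
    // scalar fallback for older architectures
    const char4 va = *reinterpret_cast<const char4 *>(&a);
    const char4 vb = *reinterpret_cast<const char4 *>(&b);
    return acc + va.x*vb.x + va.y*vb.y + va.z*vb.z + va.w*vb.w;
#endif
}
```

CC_TURING is presumably used later in the file to pick per-architecture tile sizes for the new mul_mat_q path; that code is outside this excerpt.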
@@ -262,10 +263,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
262
263
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
263
264
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
264
265
 
265
- #ifndef GGML_CUDA_MMQ_Y
266
- #define GGML_CUDA_MMQ_Y 64
267
- #endif // GGML_CUDA_MMQ_Y
268
-
269
266
  // dmmv = dequantize_mul_mat_vec
270
267
  #ifndef GGML_CUDA_DMMV_X
271
268
  #define GGML_CUDA_DMMV_X 32
@@ -285,6 +282,20 @@ struct ggml_tensor_extra_gpu {
285
282
  cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
286
283
  };
287
284
 
285
+ static int g_device_count = -1;
286
+ static int g_main_device = 0;
287
+ static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
288
+ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
289
+ static bool g_mul_mat_q = false;
290
+
291
+ static void * g_scratch_buffer = nullptr;
292
+ static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
293
+ static size_t g_scratch_offset = 0;
294
+
295
+ static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
296
+
297
+ static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
298
+
288
299
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
289
300
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
290
301
 
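Note: this hunk only adds the declarations of the per-device globals (compute capabilities, tensor split, scratch buffer, cuBLAS handles, main streams); the code that fills them is outside the excerpt. For orientation, a hedged sketch of how such tables are conventionally initialized with the CUDA runtime and cuBLAS APIs (assumed simplification, not the package's initialization code):

```cuda
// Minimal sketch (assumptions, not the package's init code): filling per-device
// tables like g_compute_capabilities / g_cublas_handles / g_cudaStreams_main.
#include <cublas_v2.h>
#include <cuda_runtime.h>

#define GGML_CUDA_MAX_DEVICES 16 // assumed bound; the real value lives in the project headers

static int            g_device_count = -1;
static int            g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
static cudaStream_t   g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = {nullptr};

static void init_cuda_devices() {
    cudaGetDeviceCount(&g_device_count);
    for (int id = 0; id < g_device_count && id < GGML_CUDA_MAX_DEVICES; ++id) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, id);
        // encode e.g. 6.1 as 610, matching the MIN_CC_DP4A / CC_TURING constants
        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;

        cudaSetDevice(id);
        cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking);
        cublasCreate(&g_cublas_handles[id]);
        cublasSetStream(g_cublas_handles[id], g_cudaStreams_main[id]);
    }
}
```

Error handling is omitted here for brevity; the real code wraps such calls in checked macros.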
@@ -1383,8 +1394,10 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
1383
1394
  sumi = __dp4a(vi1, u[2*i+1], sumi);
1384
1395
  }
1385
1396
 
1397
+ const float2 ds8f = __half22float2(ds8);
1398
+
1386
1399
  // second part effectively subtracts 8 from each quant value
1387
- return d4 * (sumi * __half2float(ds8.x) - (8*vdr/QI4_0) * __half2float(ds8.y));
1400
+ return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
1388
1401
  #else
1389
1402
  return 0.0f; // only to satisfy the compiler
1390
1403
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -1410,12 +1423,14 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
1410
1423
  }
1411
1424
 
1412
1425
  #ifdef GGML_CUDA_F16
1413
- const half2 tmp = __hmul2(dm4, ds8);
1414
- const float d4d8 = __half2float(tmp.x);
1415
- const float m4s8 = __half2float(tmp.y);
1426
+ const float2 tmp = __half22float2(__hmul2(dm4, ds8));
1427
+ const float d4d8 = tmp.x;
1428
+ const float m4s8 = tmp.y;
1416
1429
  #else
1417
- const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x);
1418
- const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y);
1430
+ const float2 dm4f = __half22float2(dm4);
1431
+ const float2 ds8f = __half22float2(ds8);
1432
+ const float d4d8 = dm4f.x * ds8f.x;
1433
+ const float m4s8 = dm4f.y * ds8f.y;
1419
1434
  #endif // GGML_CUDA_F16
1420
1435
 
1421
1436
  // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
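Note: the recurring change in the hunks above and below replaces two separate __half2float calls on the .x/.y components of a packed half2 with a single __half22float2 conversion. A minimal sketch of the pattern (illustrative names, not the package's exact helper):

```cuda
// Minimal sketch of the conversion pattern; names are illustrative only.
#include <cuda_fp16.h>

static __device__ __forceinline__ float apply_block_scales(
        const int sumi, const float d4, const half2 ds8, const int offset) {
    const float2 ds8f = __half22float2(ds8); // one conversion: ds8f.x = scale, ds8f.y = packed sum term
    return d4 * (sumi*ds8f.x - offset*ds8f.y); // same shape as the q4_0/q5_0 returns above
}
```

A single conversion is marginally cheaper and keeps the half2 component access in one place instead of spread over two intrinsic calls.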
@@ -1434,6 +1449,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
1434
1449
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1435
1450
  int sumi = 0;
1436
1451
 
1452
+ #pragma unroll
1437
1453
  for (int i = 0; i < vdr; ++i) {
1438
1454
  int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
1439
1455
  vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
@@ -1450,8 +1466,10 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
1450
1466
  sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
1451
1467
  }
1452
1468
 
1469
+ const float2 ds8f = __half22float2(ds8);
1470
+
1453
1471
  // second part effectively subtracts 16 from each quant value
1454
- return d5 * (sumi*__half2float(ds8.x) - (16*vdr/QI5_0) * __half2float(ds8.y));
1472
+ return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
1455
1473
  #else
1456
1474
  return 0.0f; // only to satisfy the compiler
1457
1475
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -1466,6 +1484,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
1466
1484
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1467
1485
  int sumi = 0;
1468
1486
 
1487
+ #pragma unroll
1469
1488
  for (int i = 0; i < vdr; ++i) {
1470
1489
  int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
1471
1490
  vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
@@ -1483,12 +1502,14 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
1483
1502
  }
1484
1503
 
1485
1504
  #ifdef GGML_CUDA_F16
1486
- const half2 tmp = __hmul2(dm5, ds8);
1487
- const float d5d8 = __half2float(tmp.x);
1488
- const float m5s8 = __half2float(tmp.y);
1505
+ const float2 tmp = __half22float2(__hmul2(dm5, ds8));
1506
+ const float d5d8 = tmp.x;
1507
+ const float m5s8 = tmp.y;
1489
1508
  #else
1490
- const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x);
1491
- const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y);
1509
+ const float2 dm5f = __half22float2(dm5);
1510
+ const float2 ds8f = __half22float2(ds8);
1511
+ const float d5d8 = dm5f.x * ds8f.x;
1512
+ const float m5s8 = dm5f.y * ds8f.y;
1492
1513
  #endif // GGML_CUDA_F16
1493
1514
 
1494
1515
  // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
@@ -1503,17 +1524,18 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
1503
1524
  #define VDR_Q8_0_Q8_1_MMQ 8
1504
1525
 
1505
1526
  template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
1506
- const int * v, const int * u, const float & d8_0, const half2 & ds8_1) {
1527
+ const int * v, const int * u, const float & d8_0, const float & d8_1) {
1507
1528
 
1508
1529
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1509
1530
  int sumi = 0;
1510
1531
 
1532
+ #pragma unroll
1511
1533
  for (int i = 0; i < vdr; ++i) {
1512
1534
  // SIMD dot product of quantized values
1513
1535
  sumi = __dp4a(v[i], u[i], sumi);
1514
1536
  }
1515
1537
 
1516
- return sumi * d8_0 * __half2float(ds8_1.x);
1538
+ return d8_0*d8_1 * sumi;
1517
1539
  #else
1518
1540
  return 0.0f; // only to satisfy the compiler
1519
1541
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -1525,18 +1547,21 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
1525
1547
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1526
1548
  int sumi = 0;
1527
1549
 
1550
+ #pragma unroll
1528
1551
  for (int i = 0; i < vdr; ++i) {
1529
1552
  // SIMD dot product of quantized values
1530
1553
  sumi = __dp4a(v[i], u[i], sumi);
1531
1554
  }
1532
1555
 
1533
1556
  #ifdef GGML_CUDA_F16
1534
- const half2 tmp = __hmul2(dm8, ds8);
1535
- const float d8d8 = __half2float(tmp.x);
1536
- const float m8s8 = __half2float(tmp.y);
1557
+ const float2 tmp = __half22float2(__hmul2(dm8, ds8));
1558
+ const float d8d8 = tmp.x;
1559
+ const float m8s8 = tmp.y;
1537
1560
  #else
1538
- const float d8d8 = __half2float(dm8.x) * __half2float(ds8.x);
1539
- const float m8s8 = __half2float(dm8.y) * __half2float(ds8.y);
1561
+ const float2 dm8f = __half22float2(dm8);
1562
+ const float2 ds8f = __half22float2(ds8);
1563
+ const float d8d8 = dm8f.x * ds8f.x;
1564
+ const float m8s8 = dm8f.y * ds8f.y;
1540
1565
  #endif // GGML_CUDA_F16
1541
1566
 
1542
1567
  // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
@@ -1546,6 +1571,312 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
1546
1571
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1547
1572
  }
1548
1573
 
1574
+ #define VDR_Q2_K_Q8_1_MMVQ 1
1575
+ #define VDR_Q2_K_Q8_1_MMQ 2
1576
+
1577
+ // contiguous v/x values
1578
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
1579
+ const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
1580
+ const half2 & dm2, const float * __restrict__ d8) {
1581
+
1582
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1583
+ float sumf_d = 0.0f;
1584
+ float sumf_m = 0.0f;
1585
+
1586
+ #pragma unroll
1587
+ for (int i = 0; i < QR2_K; ++i) {
1588
+ const int sc = scales[2*i];
1589
+
1590
+ const int vi = (v >> (2*i)) & 0x03030303;
1591
+
1592
+ sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
1593
+
1594
+ // fill int with 4x m
1595
+ int m = sc >> 4;
1596
+ m |= m << 8;
1597
+ m |= m << 16;
1598
+ sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
1599
+ }
1600
+
1601
+ const float2 dm2f = __half22float2(dm2);
1602
+
1603
+ return dm2f.x*sumf_d - dm2f.y*sumf_m;
1604
+ #else
1605
+ return 0.0f; // only to satisfy the compiler
1606
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1607
+ }
1608
+
1609
+ // contiguous u/y values
1610
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
1611
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
1612
+ const half2 & dm2, const float & d8) {
1613
+
1614
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1615
+ int sumi_d = 0;
1616
+ int sumi_m = 0;
1617
+
1618
+ #pragma unroll
1619
+ for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
1620
+ int sumi_d_sc = 0;
1621
+
1622
+ const int sc = scales[i0 / (QI8_1/2)];
1623
+
1624
+ // fill int with 4x m
1625
+ int m = sc >> 4;
1626
+ m |= m << 8;
1627
+ m |= m << 16;
1628
+
1629
+ #pragma unroll
1630
+ for (int i = i0; i < i0 + QI8_1/2; ++i) {
1631
+ sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
1632
+ sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m
1633
+ }
1634
+
1635
+ sumi_d += sumi_d_sc * (sc & 0xF);
1636
+ }
1637
+
1638
+ const float2 dm2f = __half22float2(dm2);
1639
+
1640
+ return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
1641
+ #else
1642
+ return 0.0f; // only to satisfy the compiler
1643
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1644
+ }
1645
+
1646
+ #define VDR_Q3_K_Q8_1_MMVQ 1
1647
+ #define VDR_Q3_K_Q8_1_MMQ 2
1648
+
1649
+ // contiguous v/x values
1650
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
1651
+ const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
1652
+ const int & scale_offset, const float & d3, const float * __restrict__ d8) {
1653
+
1654
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1655
+ float sumf = 0.0f;
1656
+
1657
+ #pragma unroll
1658
+ for (int i = 0; i < QR3_K; ++i) {
1659
+ const int isc = scale_offset + 2*i;
1660
+
1661
+ const int isc_low = isc % (QK_K/32);
1662
+ const int sc_shift_low = 4 * (isc / (QK_K/32));
1663
+ const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
1664
+
1665
+ const int isc_high = isc % (QK_K/64);
1666
+ const int sc_shift_high = 2 * (isc / (QK_K/64));
1667
+ const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
1668
+
1669
+ const int sc = (sc_low | sc_high) - 32;
1670
+
1671
+ const int vil = (vl >> (2*i)) & 0x03030303;
1672
+
1673
+ const int vih = ((vh >> i) << 2) & 0x04040404;
1674
+
1675
+ const int vi = __vsubss4(vil, vih);
1676
+
1677
+ sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
1678
+ }
1679
+
1680
+ return d3 * sumf;
1681
+ #else
1682
+ return 0.0f; // only to satisfy the compiler
1683
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1684
+ }
1685
+
1686
+ // contiguous u/y values
1687
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
1688
+ const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
1689
+ const float & d3, const float & d8) {
1690
+
1691
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1692
+ int sumi = 0;
1693
+
1694
+ #pragma unroll
1695
+ for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
1696
+ int sumi_sc = 0;
1697
+
1698
+ for (int i = i0; i < i0 + QI8_1/2; ++i) {
1699
+ sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
1700
+ }
1701
+
1702
+ sumi += sumi_sc * scales[i0 / (QI8_1/2)];
1703
+ }
1704
+
1705
+ return d3*d8 * sumi;
1706
+ #else
1707
+ return 0.0f; // only to satisfy the compiler
1708
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1709
+ }
1710
+
1711
+ #define VDR_Q4_K_Q8_1_MMVQ 2
1712
+ #define VDR_Q4_K_Q8_1_MMQ 8
1713
+
1714
+ // contiguous v/x values
1715
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
1716
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
1717
+ const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
1718
+
1719
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1720
+ float sumf_d = 0.0f;
1721
+ float sumf_m = 0.0f;
1722
+
1723
+ #pragma unroll
1724
+ for (int i = 0; i < QR4_K; ++i) {
1725
+ const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
1726
+ const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
1727
+
1728
+ const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
1729
+ const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
1730
+
1731
+ sumf_d += d8[i] * (dot1 * sc[i]);
1732
+ sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
1733
+ }
1734
+
1735
+ const float2 dm4f = __half22float2(dm4);
1736
+
1737
+ return dm4f.x*sumf_d - dm4f.y*sumf_m;
1738
+
1739
+ #else
1740
+ return 0.0f; // only to satisfy the compiler
1741
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1742
+ }
1743
+
1744
+ // contiguous u/y values
1745
+ // also used for q5_K
1746
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
1747
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
1748
+ const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
1749
+
1750
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1751
+ float sumf_d = 0.0f;
1752
+ float sumf_m = 0.0f;
1753
+
1754
+ #pragma unroll
1755
+ for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) {
1756
+ int sumi_d = 0;
1757
+
1758
+ #pragma unroll
1759
+ for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) {
1760
+ sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product
1761
+ sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
1762
+ }
1763
+
1764
+ const float2 ds8f = __half22float2(ds8[i0 / 4]);
1765
+
1766
+ sumf_d += ds8f.x * (sc[i0/4] * sumi_d);
1767
+ sumf_m += ds8f.y * m[i0/4]; // sum of q8_1 block * q4_K min val
1768
+ }
1769
+
1770
+ const float2 dm4f = __half22float2(dm4);
1771
+
1772
+ return dm4f.x*sumf_d - dm4f.y*sumf_m;
1773
+
1774
+ #else
1775
+ return 0.0f; // only to satisfy the compiler
1776
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1777
+ }
1778
+
1779
+ #define VDR_Q5_K_Q8_1_MMVQ 2
1780
+ #define VDR_Q5_K_Q8_1_MMQ 8
1781
+
1782
+ // contiguous v/x values
1783
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
1784
+ const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
1785
+ const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
1786
+
1787
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1788
+ float sumf_d = 0.0f;
1789
+ float sumf_m = 0.0f;
1790
+
1791
+ #pragma unroll
1792
+ for (int i = 0; i < QR5_K; ++i) {
1793
+ const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
1794
+ const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
1795
+
1796
+ const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
1797
+ const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
1798
+
1799
+ const int v0i = vl0i | vh0i;
1800
+ const int v1i = vl1i | vh1i;
1801
+
1802
+ const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
1803
+ const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
1804
+
1805
+ sumf_d += d8[i] * (dot1 * sc[i]);
1806
+ sumf_m += d8[i] * (dot2 * m[i]);
1807
+
1808
+ }
1809
+
1810
+ const float2 dm5f = __half22float2(dm5);
1811
+
1812
+ return dm5f.x*sumf_d - dm5f.y*sumf_m;
1813
+
1814
+ #else
1815
+ return 0.0f; // only to satisfy the compiler
1816
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1817
+ }
1818
+
1819
+ #define VDR_Q6_K_Q8_1_MMVQ 1
1820
+ #define VDR_Q6_K_Q8_1_MMQ 8
1821
+
1822
+ // contiguous v/x values
1823
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
1824
+ const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
1825
+ const float & d, const float * __restrict__ d8) {
1826
+
1827
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1828
+ float sumf = 0.0f;
1829
+
1830
+ #pragma unroll
1831
+ for (int i = 0; i < QR6_K; ++i) {
1832
+ const int sc = scales[4*i];
1833
+
1834
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
1835
+
1836
+ const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
1837
+
1838
+ const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
1839
+
1840
+ sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
1841
+ }
1842
+
1843
+ return d*sumf;
1844
+ #else
1845
+ return 0.0f; // only to satisfy the compiler
1846
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1847
+ }
1848
+
1849
+ // contiguous u/y values
1850
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
1851
+ const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
1852
+ const float & d6, const float * __restrict__ d8) {
1853
+
1854
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1855
+ float sumf_d = 0.0f;
1856
+
1857
+ #pragma unroll
1858
+ for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
1859
+ int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
1860
+
1861
+ #pragma unroll
1862
+ for (int i = i0; i < i0 + 2; ++i) {
1863
+ sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
1864
+ sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
1865
+
1866
+ sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
1867
+ sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
1868
+ }
1869
+
1870
+ sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
1871
+ }
1872
+
1873
+ return d6 * sumf_d;
1874
+
1875
+ #else
1876
+ return 0.0f; // only to satisfy the compiler
1877
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1878
+ }
1879
+
1549
1880
  static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1550
1881
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1551
1882
 
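Note: the block of new q2_K..q6_K helpers above repeatedly uses a small trick for the minimum/offset term: the 4-bit minimum m is broadcast into all four bytes of an int so that one __dp4a computes m times the sum of four packed q8_1 values. A standalone sketch of just that trick (illustrative, not the package's code):

```cuda
// Minimal sketch of the "fill int with 4x m" trick used in the q2_K helpers above:
// broadcasting a small scalar m into all 4 bytes of an int lets __dp4a compute
// m * (sum of the four int8 values packed in u) in a single instruction.
static __device__ __forceinline__ int times_sum_of_bytes(const int sc, const int u) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610 // MIN_CC_DP4A
    int m = sc >> 4;     // 4-bit minimum taken from the packed scale byte
    m |= m << 8;
    m |= m << 16;        // m now sits in every byte of the int
    return __dp4a(m, u, 0); // = m*u0 + m*u1 + m*u2 + m*u3
#else
    return 0; // only to satisfy the compiler on older architectures
#endif
}
```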
@@ -1564,21 +1895,21 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1564
1895
  return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
1565
1896
  }
1566
1897
 
1567
- static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1898
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1568
1899
 
1569
- __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
1570
- __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0];
1900
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
1901
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
1571
1902
 
1572
1903
  *x_ql = tile_x_qs;
1573
1904
  *x_dm = (half2 *) tile_x_d;
1574
1905
  }
1575
1906
 
1576
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
1907
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
1577
1908
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1578
1909
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1579
1910
 
1580
1911
  __builtin_assume(i_offset >= 0);
1581
- __builtin_assume(i_offset < 8);
1912
+ __builtin_assume(i_offset < nwarps);
1582
1913
  __builtin_assume(k >= 0);
1583
1914
  __builtin_assume(k < WARP_SIZE);
1584
1915
 
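Note: from here on, the allocate_tiles_* and load_tiles_* helpers stop hard-coding GGML_CUDA_MMQ_Y (removed near line 265) and the warp count 8, and instead take mmq_y and nwarps as template parameters, presumably so that several tile configurations can be compiled and one selected per device (e.g. keyed on the new g_compute_capabilities / CC_TURING). A reduced sketch of the template shape (an assumed simplification, not the real kernels):

```cuda
// Reduced sketch of the templated tile helpers; an assumed simplification, not the real kernels.
#define WARP_SIZE 32

template <int mmq_y> static __device__ __forceinline__ void allocate_tile(int ** x_ql) {
    __shared__ int tile_x_qs[mmq_y * (WARP_SIZE + 1)]; // +1 int of padding per row against bank conflicts
    *x_ql = tile_x_qs;
}

template <int mmq_y, int nwarps, bool need_check>
static __device__ __forceinline__ void load_tile(
        const int * __restrict__ x, int * __restrict__ x_ql,
        const int i_offset, const int i_max, const int k) {
    #pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { // nwarps warps fill mmq_y rows cooperatively
        int i = i0 + i_offset;
        if (need_check) {
            i = min(i, i_max); // clamp the row for the last, partial tile instead of reading past the end
        }
        x_ql[i * (WARP_SIZE + 1) + k] = x[i * WARP_SIZE + k];
    }
}
```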
@@ -1590,7 +1921,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
1590
1921
  float * x_dmf = (float *) x_dm;
1591
1922
 
1592
1923
  #pragma unroll
1593
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
1924
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
1594
1925
  int i = i0 + i_offset;
1595
1926
 
1596
1927
  if (need_check) {
@@ -1600,38 +1931,30 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
1600
1931
  const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
1601
1932
 
1602
1933
  x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
1603
- x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
1934
+ // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
1604
1935
  }
1605
1936
 
1606
- // const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
1607
- // const int kbxd = k % blocks_per_tile_x_row;
1937
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
1938
+ const int kbxd = k % blocks_per_tile_x_row;
1608
1939
 
1609
- // #pragma unroll
1610
- // for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) {
1611
- // FIXME out-of-bounds
1612
- // const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
1940
+ #pragma unroll
1941
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
1942
+ int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
1613
1943
 
1614
- // if (i >= GGML_CUDA_MMQ_Y) {
1615
- // return;
1616
- // }
1944
+ if (need_check) {
1945
+ i = min(i, i_max);
1946
+ }
1617
1947
 
1618
- // const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
1948
+ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
1619
1949
 
1620
- // x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d;
1621
- // }
1950
+ x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
1951
+ }
1622
1952
  }
1623
1953
 
1624
1954
  static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
1625
1955
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1626
1956
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1627
1957
 
1628
- __builtin_assume(i >= 0);
1629
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
1630
- __builtin_assume(j >= 0);
1631
- __builtin_assume(j < WARP_SIZE);
1632
- __builtin_assume(k >= 0);
1633
- __builtin_assume(k < WARP_SIZE);
1634
-
1635
1958
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1636
1959
  const float * x_dmf = (float *) x_dm;
1637
1960
 
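Note: another recurring pattern above — for the formats whose blocks carry a single float scale (q4_0, q5_0, q8_0, and now q3_K), the per-block scales are written into the shared tile as plain float through the common half2* interface and read back the same way (the x_dmf / y_df casts). A minimal sketch of that reinterpretation, under the assumption that writer and reader agree on the layout:

```cuda
// Minimal sketch: a float scale tile exposed through the generic half2* tile pointer.
// Works only because producer and consumer both reinterpret the same buffer as float.
#include <cuda_fp16.h>

#define WARP_SIZE 32
#define QI4_0 8

template <int mmq_y> static __device__ __forceinline__ void allocate_scale_tile(half2 ** x_dm) {
    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
    *x_dm = (half2 *) tile_x_d; // the generic tile interface expects half2*
}

static __device__ __forceinline__ void store_scale(half2 * x_dm, const int idx, const float d) {
    float * x_dmf = (float *) x_dm; // write side: treat the tile as float
    x_dmf[idx] = d;
}

static __device__ __forceinline__ float load_scale(const half2 * x_dm, const int idx) {
    const float * x_dmf = (const float *) x_dm; // read side: same reinterpretation
    return x_dmf[idx];
}
```

The half2-scaled formats (q4_1, q5_1, and the K-quants with a {d, dmin} pair) keep using the tile as real half2 data.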
@@ -1639,13 +1962,13 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
1639
1962
 
1640
1963
  #pragma unroll
1641
1964
  for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
1642
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1643
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0];
1965
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
1966
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
1644
1967
  }
1645
1968
 
1646
1969
  return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
1647
1970
  (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
1648
- y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
1971
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
1649
1972
  }
1650
1973
 
1651
1974
  static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
@@ -1666,21 +1989,21 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
1666
1989
  return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
1667
1990
  }
1668
1991
 
1669
- static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1992
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1670
1993
 
1671
- __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + + GGML_CUDA_MMQ_Y];
1672
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1) + GGML_CUDA_MMQ_Y/QI4_1];
1994
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
1995
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
1673
1996
 
1674
1997
  *x_ql = tile_x_qs;
1675
1998
  *x_dm = tile_x_dm;
1676
1999
  }
1677
2000
 
1678
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
2001
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
1679
2002
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1680
2003
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1681
2004
 
1682
2005
  __builtin_assume(i_offset >= 0);
1683
- __builtin_assume(i_offset < 8);
2006
+ __builtin_assume(i_offset < nwarps);
1684
2007
  __builtin_assume(k >= 0);
1685
2008
  __builtin_assume(k < WARP_SIZE);
1686
2009
 
@@ -1690,7 +2013,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
1690
2013
  const block_q4_1 * bx0 = (block_q4_1 *) vx;
1691
2014
 
1692
2015
  #pragma unroll
1693
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2016
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
1694
2017
  int i = i0 + i_offset;
1695
2018
 
1696
2019
  if (need_check) {
@@ -1706,7 +2029,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
1706
2029
  const int kbxd = k % blocks_per_tile_x_row;
1707
2030
 
1708
2031
  #pragma unroll
1709
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) {
2032
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
1710
2033
  int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
1711
2034
 
1712
2035
  if (need_check) {
@@ -1723,26 +2046,19 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
1723
2046
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1724
2047
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1725
2048
 
1726
- __builtin_assume(i >= 0);
1727
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
1728
- __builtin_assume(j >= 0);
1729
- __builtin_assume(j < WARP_SIZE);
1730
- __builtin_assume(k >= 0);
1731
- __builtin_assume(k < WARP_SIZE);
1732
-
1733
2049
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1734
2050
 
1735
2051
  int u[2*VDR_Q4_1_Q8_1_MMQ];
1736
2052
 
1737
2053
  #pragma unroll
1738
2054
  for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
1739
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1740
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1];
2055
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
2056
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
1741
2057
  }
1742
2058
 
1743
2059
  return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
1744
2060
  (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
1745
- y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
2061
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
1746
2062
  }
1747
2063
 
1748
2064
  static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
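Note: the other systematic index change in these *_mul_mat functions is that gathers from y_qs/y_ds drop the 2*WARP_SIZE row stride and instead wrap modulo WARP_SIZE (and WARP_SIZE/QI8_1 for the scales); the y tile apparently now holds one WARP_SIZE-wide slice per column, with the second operand of each dot product wrapping around. A sketch of just the index computation (illustrative, not the package's code):

```cuda
// Illustrative index helper mirroring the new u[] gather above (not the package's code).
#define WARP_SIZE 32
#define QI8_1 8
#define QI4_0 8
#define VDR_Q4_0_Q8_1_MMQ 4 // assumed value for the sketch

static __device__ __forceinline__ void gather_u_q4_0(
        const int * __restrict__ y_qs, const int j, const int k, int * u /* 2*VDR ints */) {
    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));

    #pragma unroll
    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
        // old layout: y_qs[j*(2*WARP_SIZE) + kyqs + l] and ... + QI4_0
        // new layout: one WARP_SIZE-wide row per column j, with wraparound
        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
    }
}
```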
@@ -1765,21 +2081,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
1765
2081
  return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
1766
2082
  }
1767
2083
 
1768
- static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2084
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1769
2085
 
1770
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
1771
- __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0];
2086
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2087
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
1772
2088
 
1773
2089
  *x_ql = tile_x_ql;
1774
2090
  *x_dm = (half2 *) tile_x_d;
1775
2091
  }
1776
2092
 
1777
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
2093
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
1778
2094
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1779
2095
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1780
2096
 
1781
2097
  __builtin_assume(i_offset >= 0);
1782
- __builtin_assume(i_offset < 8);
2098
+ __builtin_assume(i_offset < nwarps);
1783
2099
  __builtin_assume(k >= 0);
1784
2100
  __builtin_assume(k < WARP_SIZE);
1785
2101
 
@@ -1789,7 +2105,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
1789
2105
  const block_q5_0 * bx0 = (block_q5_0 *) vx;
1790
2106
 
1791
2107
  #pragma unroll
1792
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2108
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
1793
2109
  int i = i0 + i_offset;
1794
2110
 
1795
2111
  if (need_check) {
@@ -1825,7 +2141,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
1825
2141
  float * x_dmf = (float *) x_dm;
1826
2142
 
1827
2143
  #pragma unroll
1828
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) {
2144
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
1829
2145
  int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
1830
2146
 
1831
2147
  if (need_check) {
@@ -1842,27 +2158,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
1842
2158
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1843
2159
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1844
2160
 
1845
- __builtin_assume(i >= 0);
1846
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
1847
- __builtin_assume(j >= 0);
1848
- __builtin_assume(j < WARP_SIZE);
1849
- __builtin_assume(k >= 0);
1850
- __builtin_assume(k < WARP_SIZE);
1851
-
1852
2161
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1853
2162
  const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
1854
- const float * x_dmf = (float *) x_dm;
2163
+ const float * x_dmf = (const float *) x_dm;
2164
+ const float * y_df = (const float *) y_ds;
1855
2165
 
1856
2166
  int u[2*VDR_Q5_0_Q8_1_MMQ];
1857
2167
 
1858
2168
  #pragma unroll
1859
2169
  for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
1860
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1861
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0];
2170
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
2171
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
1862
2172
  }
1863
2173
 
1864
2174
  return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
1865
- (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
2175
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
1866
2176
  }
1867
2177
 
1868
2178
  static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
@@ -1885,21 +2195,21 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
1885
2195
  return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
1886
2196
  }
1887
2197
 
1888
- static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2198
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1889
2199
 
1890
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
1891
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1];
2200
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2201
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
1892
2202
 
1893
2203
  *x_ql = tile_x_ql;
1894
2204
  *x_dm = tile_x_dm;
1895
2205
  }
1896
2206
 
1897
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
2207
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
1898
2208
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1899
2209
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1900
2210
 
1901
2211
  __builtin_assume(i_offset >= 0);
1902
- __builtin_assume(i_offset < 8);
2212
+ __builtin_assume(i_offset < nwarps);
1903
2213
  __builtin_assume(k >= 0);
1904
2214
  __builtin_assume(k < WARP_SIZE);
1905
2215
 
@@ -1909,7 +2219,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
1909
2219
  const block_q5_1 * bx0 = (block_q5_1 *) vx;
1910
2220
 
1911
2221
  #pragma unroll
1912
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2222
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
1913
2223
  int i = i0 + i_offset;
1914
2224
 
1915
2225
  if (need_check) {
@@ -1942,7 +2252,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
1942
2252
  const int kbxd = k % blocks_per_tile_x_row;
1943
2253
 
1944
2254
  #pragma unroll
1945
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) {
2255
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
1946
2256
  int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
1947
2257
 
1948
2258
  if (need_check) {
@@ -1959,13 +2269,6 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
1959
2269
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1960
2270
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1961
2271
 
1962
- __builtin_assume(i >= 0);
1963
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
1964
- __builtin_assume(j >= 0);
1965
- __builtin_assume(j < WARP_SIZE);
1966
- __builtin_assume(k >= 0);
1967
- __builtin_assume(k < WARP_SIZE);
1968
-
1969
2272
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1970
2273
  const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
1971
2274
 
@@ -1973,12 +2276,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
1973
2276
 
1974
2277
  #pragma unroll
1975
2278
  for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
1976
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1977
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1];
2279
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
2280
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
1978
2281
  }
1979
2282
 
1980
2283
  return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
1981
- (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
2284
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
1982
2285
  }
1983
2286
 
1984
2287
  static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
@@ -1989,29 +2292,30 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
1989
2292
  int v[VDR_Q8_0_Q8_1_MMVQ];
1990
2293
  int u[VDR_Q8_0_Q8_1_MMVQ];
1991
2294
 
2295
+ #pragma unroll
1992
2296
  for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
1993
2297
  v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
1994
2298
  u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1995
2299
  }
1996
2300
 
1997
- return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds);
2301
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
1998
2302
  }
1999
2303
 
2000
- static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2304
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2001
2305
 
2002
- __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2003
- __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0];
2306
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
2307
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
2004
2308
 
2005
2309
  *x_ql = tile_x_qs;
2006
2310
  *x_dm = (half2 *) tile_x_d;
2007
2311
  }
2008
2312
 
2009
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
2313
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
2010
2314
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2011
2315
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2012
2316
 
2013
2317
  __builtin_assume(i_offset >= 0);
2014
- __builtin_assume(i_offset < 8);
2318
+ __builtin_assume(i_offset < nwarps);
2015
2319
  __builtin_assume(k >= 0);
2016
2320
  __builtin_assume(k < WARP_SIZE);
2017
2321
 
@@ -2022,7 +2326,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
2022
2326
  const block_q8_0 * bx0 = (block_q8_0 *) vx;
2023
2327
 
2024
2328
  #pragma unroll
2025
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2329
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2026
2330
  int i = i0 + i_offset;
2027
2331
 
2028
2332
  if (need_check) {
@@ -2032,76 +2336,35 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
2032
2336
  const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
2033
2337
 
2034
2338
  x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
2035
- x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
2036
2339
  }
2037
2340
 
2038
- // const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
2039
- // const int kbxd = k % blocks_per_tile_x_row;
2341
+ const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
2342
+ const int kbxd = k % blocks_per_tile_x_row;
2040
2343
 
2041
- // #pragma unroll
2042
- // for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) {
2043
- // FIXME out-of-bounds
2044
- // const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
2344
+ #pragma unroll
2345
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
2346
+ int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
2045
2347
 
2046
- // #if GGML_CUDA_MMQ_Y < 64
2047
- // if (i >= GGML_CUDA_MMQ_Y) {
2048
- // return;
2049
- // }
2050
- // #endif // GGML_CUDA_MMQ_Y < 64
2348
+ if (need_check) {
2349
+ i = min(i, i_max);
2350
+ }
2051
2351
 
2052
- // const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
2352
+ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
2053
2353
 
2054
- // x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d;
2055
- // }
2354
+ x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
2355
+ }
2056
2356
  }
2057
2357
 
2058
2358
  static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
2059
2359
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2060
2360
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2061
2361
 
2062
- __builtin_assume(i >= 0);
2063
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2064
- __builtin_assume(j >= 0);
2065
- __builtin_assume(j < WARP_SIZE);
2066
- __builtin_assume(k >= 0);
2067
- __builtin_assume(k < WARP_SIZE);
2068
-
2069
- const float * x_dmf = (float *) x_dm;
2362
+ const float * x_dmf = (const float *) x_dm;
2363
+ const float * y_df = (const float *) y_ds;
2070
2364
 
2071
2365
  return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
2072
2366
  (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
2073
- y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
2074
- }
2075
-
2076
- #define VDR_q2_K_q8_1 1
2077
-
2078
- static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl(
2079
- const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
2080
- const half2 & dm, const float * __restrict__ d8) {
2081
-
2082
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2083
- float sumf_d = 0.0f;
2084
- float sumf_m = 0.0f;
2085
-
2086
- for (int i = 0; i < QR2_K; ++i) {
2087
- const int sc = scales[2*i];
2088
-
2089
- const int vi = (v >> (2*i)) & 0x03030303;
2090
-
2091
- sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
2092
-
2093
- int sc_high = sc >> 4;
2094
- sc_high |= sc_high << 8;
2095
- sc_high |= sc_high << 16;
2096
- sumf_m += d8[i] * __dp4a(sc_high, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
2097
- }
2098
-
2099
- const float2 dmf = __half22float2(dm);
2100
-
2101
- return dmf.x*sumf_d - dmf.y*sumf_m;
2102
- #else
2103
- return 0.0f; // only to satisfy the compiler
2104
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2367
+ y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
2105
2368
  }
2106
2369
 
2107
2370
  static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
@@ -2115,34 +2378,35 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
2115
2378
  const uint8_t * scales = bq2_K->scales + scale_offset;
2116
2379
 
2117
2380
  const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
2118
- int u[QR2_K];
2381
+ int u[QR2_K];
2119
2382
  float d8[QR2_K];
2120
2383
 
2384
+ #pragma unroll
2121
2385
  for (int i = 0; i < QR2_K; ++ i) {
2122
2386
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
2123
2387
  d8[i] = bq8_1[bq8_offset + i].ds.x;
2124
2388
  }
2125
2389
 
2126
- return vec_dot_q2_K_q8_1_impl(v, u, scales, bq2_K->dm, d8);
2390
+ return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
2127
2391
  }
2128
2392
 
2129
- static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2393
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2130
2394
 
2131
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2132
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K];
2133
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2395
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2396
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
2397
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
2134
2398
 
2135
2399
  *x_ql = tile_x_ql;
2136
2400
  *x_dm = tile_x_dm;
2137
2401
  *x_sc = tile_x_sc;
2138
2402
  }
2139
2403
 
2140
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
2404
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
2141
2405
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2142
2406
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2143
2407
 
2144
2408
  __builtin_assume(i_offset >= 0);
2145
- __builtin_assume(i_offset < 8);
2409
+ __builtin_assume(i_offset < nwarps);
2146
2410
  __builtin_assume(k >= 0);
2147
2411
  __builtin_assume(k < WARP_SIZE);
2148
2412
 
@@ -2152,7 +2416,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
2152
2416
  const block_q2_K * bx0 = (block_q2_K *) vx;
2153
2417
 
2154
2418
  #pragma unroll
2155
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2419
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2156
2420
  int i = i0 + i_offset;
2157
2421
 
2158
2422
  if (need_check) {
@@ -2168,8 +2432,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
2168
2432
  const int kbxd = k % blocks_per_tile_x_row;
2169
2433
 
2170
2434
  #pragma unroll
2171
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) {
2172
- int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2435
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
2436
+ int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
2173
2437
 
2174
2438
  if (need_check) {
2175
2439
  i = min(i, i_max);
@@ -2181,7 +2445,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
2181
2445
  }
2182
2446
 
2183
2447
  #pragma unroll
2184
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2448
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
2185
2449
  int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2186
2450
 
2187
2451
  if (need_check) {
@@ -2198,68 +2462,24 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
2198
2462
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2199
2463
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2200
2464
 
2201
- __builtin_assume(i >= 0);
2202
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2203
- __builtin_assume(j >= 0);
2204
- __builtin_assume(j < WARP_SIZE);
2205
- __builtin_assume(k >= 0);
2206
- __builtin_assume(k < WARP_SIZE);
2207
-
2208
- const int kbx = k / QI2_K;
2209
- const int kqsx = k % QI2_K;
2465
+ const int kbx = k / QI2_K;
2466
+ const int ky = (k % QI2_K) * QR2_K;
2467
+ const float * y_df = (const float *) y_ds;
2210
2468
 
2211
- const int bq8_offset = QR2_K * (kqsx / QI8_1);
2212
- const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
2469
+ int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
2213
2470
 
2214
- const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16 + scale_offset;
2471
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
2472
+ const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
2215
2473
 
2216
- int u[QR2_K];
2217
- float d8[QR2_K];
2218
-
2219
- for (int l = 0; l < QR2_K; ++ l) {
2220
- const int y_qs_index = j * (QR2_K*WARP_SIZE) + kbx * (QR2_K*QI2_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
2221
- u[l] = y_qs[y_qs_index];
2222
- d8[l] = y_ds[y_qs_index / QI8_1].x;
2474
+ #pragma unroll
2475
+ for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
2476
+ v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
2223
2477
  }
2224
2478
 
2225
- return vec_dot_q2_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], u, scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], d8);
2226
- }
2227
-
2228
- #define VDR_q3_K_q8_1 1
2229
-
2230
- static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl(
2231
- const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
2232
- const int & scale_offset, const float & d, const float * __restrict__ d8) {
2233
-
2234
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2235
- float sumf = 0.0f;
2236
-
2237
- for (int i = 0; i < QR3_K; ++i) {
2238
- const int isc = scale_offset + 2*i;
2479
+ const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
2239
2480
 
2240
- const int isc_low = isc % (QK_K/32);
2241
- const int sc_shift_low = 4 * (isc / (QK_K/32));
2242
- const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
2243
-
2244
- const int isc_high = isc % (QK_K/64);
2245
- const int sc_shift_high = 2 * (isc / (QK_K/64));
2246
- const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
2247
-
2248
- const int sc = (sc_low | sc_high) - 32;
2249
-
2250
- const int vil = (vl >> (2*i)) & 0x03030303;
2251
-
2252
- const int vih = ((vh >> i) << 2) & 0x04040404;
2253
-
2254
- const int vi = __vsubss4(vil, vih);
2255
-
2256
- sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
2257
- }
2258
-
2259
- return d*sumf;
2260
- #else
2261
- return 0.0f; // only to satisfy the compiler
2262
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2481
+ const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
2482
+ return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
2263
2483
  }
2264
2484
 
2265
2485
  static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
@@ -2277,23 +2497,24 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
2277
2497
  // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2278
2498
  const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
2279
2499
 
2280
- int u[QR3_K];
2500
+ int u[QR3_K];
2281
2501
  float d8[QR3_K];
2282
2502
 
2503
+ #pragma unroll
2283
2504
  for (int i = 0; i < QR3_K; ++i) {
2284
2505
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
2285
2506
  d8[i] = bq8_1[bq8_offset + i].ds.x;
2286
2507
  }
2287
2508
 
2288
- return vec_dot_q3_K_q8_1_impl(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
2509
+ return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
2289
2510
  }
2290
2511
 
2291
- static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2512
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2292
2513
 
2293
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2294
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K];
2295
- __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
2296
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2514
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2515
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
2516
+ __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
2517
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
2297
2518
 
2298
2519
  *x_ql = tile_x_ql;
2299
2520
  *x_dm = tile_x_dm;
@@ -2301,12 +2522,12 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 **
2301
2522
  *x_sc = tile_x_sc;
2302
2523
  }
2303
2524
 
2304
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
2525
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
2305
2526
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2306
2527
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2307
2528
 
2308
2529
  __builtin_assume(i_offset >= 0);
2309
- __builtin_assume(i_offset < 8);
2530
+ __builtin_assume(i_offset < nwarps);
2310
2531
  __builtin_assume(k >= 0);
2311
2532
  __builtin_assume(k < WARP_SIZE);
2312
2533
 
@@ -2316,7 +2537,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2316
2537
  const block_q3_K * bx0 = (block_q3_K *) vx;
2317
2538
 
2318
2539
  #pragma unroll
2319
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2540
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2320
2541
  int i = i0 + i_offset;
2321
2542
 
2322
2543
  if (need_check) {
@@ -2330,10 +2551,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2330
2551
 
2331
2552
  const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
2332
2553
  const int kbxd = k % blocks_per_tile_x_row;
2554
+ float * x_dmf = (float *) x_dm;
2333
2555
 
2334
2556
  #pragma unroll
2335
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) {
2336
- int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2557
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
2558
+ int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
2337
2559
 
2338
2560
  if (need_check) {
2339
2561
  i = min(i, i_max);
@@ -2341,11 +2563,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2341
2563
 
2342
2564
  const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
2343
2565
 
2344
- x_dm[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd].x = bxi->d;
2566
+ x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
2345
2567
  }
2346
2568
 
2347
2569
  #pragma unroll
2348
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
2570
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
2349
2571
  int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
2350
2572
 
2351
2573
  if (need_check) {
@@ -2354,11 +2576,12 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2354
2576
 
2355
2577
  const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
2356
2578
 
2357
- x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
2579
+ // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2580
+ x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
2358
2581
  }
2359
2582
 
2360
2583
  #pragma unroll
2361
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2584
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
2362
2585
  int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2363
2586
 
2364
2587
  if (need_check) {
@@ -2367,7 +2590,19 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2367
2590
 
2368
2591
  const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
2369
2592
 
2370
- x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->scales, k % (QI3_K/4));
2593
+ const int ksc = k % (QI3_K/4);
2594
+
2595
+ const int ksc_low = ksc % (QI3_K/8);
2596
+ const int shift_low = 4 * (ksc / (QI3_K/8));
2597
+ const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
2598
+
2599
+ const int ksc_high = QI3_K/8;
2600
+ const int shift_high = 2 * ksc;
2601
+ const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
2602
+
2603
+ const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
2604
+
2605
+ x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
2371
2606
  }
2372
2607
  }
2373
2608
 
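Note: the new q3_K tile loader above repacks the 6-bit scales at load time: the 4-bit low parts and 2-bit high parts are combined into one int per four scales, and 32 is subtracted from every byte at once with __vsubss4 (per-byte saturated subtraction), so the hot dot-product loop can read signed 8-bit scales directly. A small standalone example of the intrinsic (illustrative):

```cuda
// Illustrative use of __vsubss4: subtract 32 from each of the four packed bytes with signed saturation.
static __device__ __forceinline__ int repack_scales(const int sc_low, const int sc_high) {
    // sc_low:  four 4-bit low scale parts, one per byte
    // sc_high: the matching 2-bit high parts already shifted into bits 4..5 of each byte
    const int sc = sc_low | sc_high;  // four 6-bit scales, one per byte
    return __vsubss4(sc, 0x20202020); // byte-wise (scale - 32), matching the q3_K encoding
}
```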
@@ -2375,63 +2610,29 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
2375
2610
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2376
2611
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2377
2612
 
2378
- __builtin_assume(i >= 0);
2379
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2380
- __builtin_assume(j >= 0);
2381
- __builtin_assume(j < WARP_SIZE);
2382
- __builtin_assume(k >= 0);
2383
- __builtin_assume(k < WARP_SIZE);
2384
-
2385
2613
  const int kbx = k / QI3_K;
2386
- const int kqsx = k % QI3_K;
2387
-
2388
- const int bq8_offset = QR3_K * (kqsx / (QI3_K/2));
2389
- const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
2390
-
2391
- const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16;
2614
+ const int ky = (k % QI3_K) * QR3_K;
2615
+ const float * x_dmf = (const float *) x_dm;
2616
+ const float * y_df = (const float *) y_ds;
2392
2617
 
2393
- // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2394
- const int vh = ~x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset;
2618
+ const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
2395
2619
 
2396
- int u[QR3_K];
2397
- float d8[QR3_K];
2398
-
2399
- for (int l = 0; l < QR3_K; ++ l) {
2400
- const int y_qs_index = j * (QR3_K*WARP_SIZE) + kbx * (QR3_K*QI3_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
2401
- u[l] = y_qs[y_qs_index];
2402
- d8[l] = y_ds[y_qs_index / QI8_1].x;
2403
- }
2404
-
2405
- return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset,
2406
- x_dm[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx].x, d8);
2407
- }
2408
-
2409
- #define VDR_q4_K_q8_1 2
2410
-
2411
- static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl(
2412
- const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
2413
- const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
2414
-
2415
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2416
- float sumf_d = 0.0f;
2417
- float sumf_m = 0.0f;
2620
+ int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
2418
2621
 
2419
- for (int i = 0; i < QR4_K; ++i) {
2420
- const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
2421
- const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
2622
+ #pragma unroll
2623
+ for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
2624
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
2625
+ const int shift = 2 * ((ky % 32) / 8);
2626
+ const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
2422
2627
 
2423
- const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
2424
- const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
2628
+ const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
2629
+ const int vlh = (vh << 2) & 0x04040404;
2425
2630
 
2426
- sumf_d += d8[i] * (dot1 * sc[i]);
2427
- sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
2631
+ v[l] = __vsubss4(vll, vlh);
2428
2632
  }
2429
2633
 
2430
- return __half2float(dm4.x)*sumf_d - __half2float(dm4.y)*sumf_m;
2431
-
2432
- #else
2433
- return 0.0f; // only to satisfy the compiler
2434
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2634
+ const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
2635
+ return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
2435
2636
  }
2436
2637
 
2437
2638
  static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
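
For reference, the rewritten q3_K dot product above rebuilds each signed quant as the 2-bit low part minus 4 whenever the corresponding bit of the inverted hmask is set, which is why load_tiles_q3_K now stores ~hmask. A rough scalar restatement for a single byte lane (the helper name is ours):

    #include <cstdint>

    // Scalar view of v[l] = __vsubss4(vll, vlh): vll holds the 2-bit low part,
    // vlh holds 4 wherever the inverted hmask bit is 1, so the result is in -4..3.
    static int8_t q3K_value(uint8_t two_bit_low, int inverted_hmask_bit) {
        return (int8_t) ((two_bit_low & 0x03) - (inverted_hmask_bit ? 4 : 0));
    }
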
@@ -2478,7 +2679,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
2478
2679
  u[2*i+1] = q8[4];
2479
2680
  }
2480
2681
 
2481
- return vec_dot_q4_K_q8_1_impl(v, u, sc, m, bq4_K->dm, d8);
2682
+ return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
2482
2683
 
2483
2684
  #else
2484
2685
 
@@ -2527,23 +2728,23 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
2527
2728
  #endif
2528
2729
  }
2529
2730
 
2530
- static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2731
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2531
2732
 
2532
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2533
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K];
2534
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
2733
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2734
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
2735
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
2535
2736
 
2536
2737
  *x_ql = tile_x_ql;
2537
2738
  *x_dm = tile_x_dm;
2538
2739
  *x_sc = tile_x_sc;
2539
2740
  }
2540
2741
 
2541
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
2742
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
2542
2743
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2543
2744
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2544
2745
 
2545
2746
  __builtin_assume(i_offset >= 0);
2546
- __builtin_assume(i_offset < 8);
2747
+ __builtin_assume(i_offset < nwarps);
2547
2748
  __builtin_assume(k >= 0);
2548
2749
  __builtin_assume(k < WARP_SIZE);
2549
2750
 
@@ -2553,7 +2754,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
2553
2754
  const block_q4_K * bx0 = (block_q4_K *) vx;
2554
2755
 
2555
2756
  #pragma unroll
2556
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2757
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2557
2758
  int i = i0 + i_offset;
2558
2759
 
2559
2760
  if (need_check) {
@@ -2563,118 +2764,62 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
2563
2764
  const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
2564
2765
 
2565
2766
  x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2566
- }
2567
-
2568
- const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
2569
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2570
-
2571
- #pragma unroll
2572
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) {
2573
- int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2574
-
2575
- if (need_check) {
2576
- i = min(i, i_max);
2577
- }
2578
-
2579
- const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
2580
-
2581
- x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
2582
- }
2583
-
2584
- #pragma unroll
2585
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
2586
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
2587
-
2588
- if (need_check) {
2589
- i = min(i, i_max);
2590
- }
2591
-
2592
- const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
2593
-
2594
- x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI4_K/8));
2595
- }
2596
- }
2597
-
2598
- static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
2599
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2600
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2601
-
2602
- __builtin_assume(i >= 0);
2603
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2604
- __builtin_assume(j >= 0);
2605
- __builtin_assume(j < WARP_SIZE);
2606
- __builtin_assume(k >= 0);
2607
- __builtin_assume(k < WARP_SIZE);
2608
-
2609
- const int kbx = k / QI6_K; // == 0 if QK_K == 256
2610
- const int kqsx = k % QI6_K; // == k if QK_K == 256
2611
-
2612
- int v[2];
2613
- int u[2*QR4_K];
2614
- float d8[QR4_K];
2767
+ }
2615
2768
 
2616
- // kqsx is in 0,2...30. bq8_offset = 2 * (kqsx/4) -> bq8_offset = 0, 2, 4, 6
2617
- const int bq8_offset = QR4_K * ((kqsx/2) / (QI8_1/2));
2769
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
2770
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2618
2771
 
2619
- v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
2620
- v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
2772
+ #pragma unroll
2773
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
2774
+ int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
2621
2775
 
2622
- const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
2623
- uint16_t aux[2];
2624
- const int l = bq8_offset/2;
2625
- if (l < 2) {
2626
- aux[0] = scales[l+0] & 0x3f3f;
2627
- aux[1] = scales[l+2] & 0x3f3f;
2628
- } else {
2629
- aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
2630
- aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
2631
- }
2632
- const uint8_t * sc = (const uint8_t *)aux;
2633
- const uint8_t * m = sc + 2;
2776
+ if (need_check) {
2777
+ i = min(i, i_max);
2778
+ }
2779
+
2780
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
2634
2781
 
2635
- for (int l = 0; l < QR4_K; ++l) {
2636
- const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
2637
- u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
2638
- u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
2639
- d8[l] = y_ds[kqsy / QI8_1].x;
2782
+ x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
2640
2783
  }
2641
2784
 
2642
- return vec_dot_q4_K_q8_1_impl(v, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K + kbx], d8);
2643
- }
2785
+ #pragma unroll
2786
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
2787
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
2644
2788
 
2645
- #define VDR_q5_K_q8_1 2
2789
+ if (need_check) {
2790
+ i = min(i, i_max);
2791
+ }
2646
2792
 
2647
- static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
2648
- const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
2649
- const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
2793
+ const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
2650
2794
 
2651
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2652
- float sumf_d = 0.0f;
2653
- float sumf_m = 0.0f;
2795
+ const int * scales = (int *) bxi->scales;
2654
2796
 
2655
- for (int i = 0; i < QR5_K; ++i) {
2656
- const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
2657
- const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
2797
+ const int ksc = k % (WARP_SIZE/8);
2658
2798
 
2659
- const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
2660
- const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
2799
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
2800
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
2801
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
2661
2802
 
2662
- const int v0i = vl0i | vh0i;
2663
- const int v1i = vl1i | vh1i;
2803
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
2804
+ }
2805
+ }
2664
2806
 
2665
- const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
2666
- const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
2807
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
2808
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2809
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2667
2810
 
2668
- sumf_d += d8[i] * (dot1 * sc[i]);
2669
- sumf_m += d8[i] * (dot2 * m[i]);
2811
+ int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
2670
2812
 
2813
+ #pragma unroll
2814
+ for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
2815
+ v[l + 0] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
2816
+ v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
2671
2817
  }
2672
2818
 
2673
- return __half2float(dm5.x)*sumf_d - __half2float(dm5.y)*sumf_m;
2819
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
2674
2820
 
2675
- #else
2676
- return 0.0f; // only to satisfy the compiler
2677
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2821
+ const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
2822
+ return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
2678
2823
  }
2679
2824
 
2680
2825
  static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
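
The new q4_K mul_mat path above first splits every packed 32-bit word of quants into two SIMD-friendly words, one holding the low nibbles and one holding the high nibbles, before handing them to vec_dot_q4_K_q8_1_impl_mmq. A tiny standalone illustration (the input value is made up):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t packed = 0x8C4F2A31u;             // 8 packed 4-bit quants (example value)
        const uint32_t lo = (packed >> 0) & 0x0F0F0F0F;  // low nibble of each byte
        const uint32_t hi = (packed >> 4) & 0x0F0F0F0F;  // high nibble of each byte
        printf("lo = %08x  hi = %08x\n", (unsigned) lo, (unsigned) hi);
        return 0;
    }
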
@@ -2711,6 +2856,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
2711
2856
  const uint8_t * sc = (const uint8_t *)aux;
2712
2857
  const uint8_t * m = sc + 2;
2713
2858
 
2859
+ #pragma unroll
2714
2860
  for (int i = 0; i < QR5_K; ++i) {
2715
2861
  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
2716
2862
  d8[i] = bq8i->ds.x;
@@ -2765,25 +2911,23 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
2765
2911
  #endif
2766
2912
  }
2767
2913
 
2768
- static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2914
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2769
2915
 
2770
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2771
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K];
2772
- __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2773
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
2916
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2917
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
2918
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
2774
2919
 
2775
2920
  *x_ql = tile_x_ql;
2776
2921
  *x_dm = tile_x_dm;
2777
- *x_qh = tile_x_qh;
2778
2922
  *x_sc = tile_x_sc;
2779
2923
  }
2780
2924
 
2781
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
2925
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
2782
2926
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2783
2927
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2784
2928
 
2785
2929
  __builtin_assume(i_offset >= 0);
2786
- __builtin_assume(i_offset < 8);
2930
+ __builtin_assume(i_offset < nwarps);
2787
2931
  __builtin_assume(k >= 0);
2788
2932
  __builtin_assume(k < WARP_SIZE);
2789
2933
 
@@ -2793,7 +2937,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
2793
2937
  const block_q5_K * bx0 = (block_q5_K *) vx;
2794
2938
 
2795
2939
  #pragma unroll
2796
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2940
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2797
2941
  int i = i0 + i_offset;
2798
2942
 
2799
2943
  if (need_check) {
@@ -2801,16 +2945,29 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
2801
2945
  }
2802
2946
 
2803
2947
  const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
2948
+ const int ky = QR5_K*kqsx;
2804
2949
 
2805
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2950
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
2951
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
2952
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
2953
+
2954
+ const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
2955
+ const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
2956
+ const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
2957
+
2958
+ const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
2959
+ const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
2960
+
2961
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
2962
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
2806
2963
  }
2807
2964
 
2808
2965
  const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
2809
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2966
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2810
2967
 
2811
2968
  #pragma unroll
2812
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) {
2813
- int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2969
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
2970
+ int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
2814
2971
 
2815
2972
  if (need_check) {
2816
2973
  i = min(i, i_max);
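
load_tiles_q5_K above now widens the tile to 2*WARP_SIZE ints per row and stores each 5-bit value fully reconstructed, low nibble OR (high bit << 4), which is what lets the q5_K mul_mat dot product further down reuse vec_dot_q4_K_q8_1_impl_mmq. The per-byte operation, as a scalar sketch (helper name is ours):

    #include <cstdint>

    // Scalar equivalent of ql0|qh0 and ql1|qh1 above, for one byte lane:
    // a q5_K quant is the 4-bit low part with its fifth bit moved into bit 4, range 0..31.
    static uint8_t q5K_value(uint8_t low_nibble, int high_bit) {
        return (uint8_t) ((low_nibble & 0x0F) | ((high_bit & 1) << 4));
    }
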
@@ -2822,29 +2979,24 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
2822
2979
  }
2823
2980
 
2824
2981
  #pragma unroll
2825
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2826
- int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2982
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
2983
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
2827
2984
 
2828
2985
  if (need_check) {
2829
2986
  i = min(i, i_max);
2830
2987
  }
2831
2988
 
2832
- const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI5_K/4);
2833
-
2834
- x_qh[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->qh, k % (QI5_K/4));
2835
- }
2989
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
2836
2990
 
2837
- #pragma unroll
2838
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
2839
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
2991
+ const int * scales = (int *) bxi->scales;
2840
2992
 
2841
- if (need_check) {
2842
- i = min(i, i_max);
2843
- }
2993
+ const int ksc = k % (WARP_SIZE/8);
2844
2994
 
2845
- const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
2995
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
2996
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
2997
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
2846
2998
 
2847
- x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI5_K/8));
2999
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
2848
3000
  }
2849
3001
  }
2850
3002
 
@@ -2852,77 +3004,11 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
2852
3004
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2853
3005
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2854
3006
 
2855
- __builtin_assume(i >= 0);
2856
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2857
- __builtin_assume(j >= 0);
2858
- __builtin_assume(j < WARP_SIZE);
2859
- __builtin_assume(k >= 0);
2860
- __builtin_assume(k < WARP_SIZE);
2861
-
2862
- const int kbx = k / QI6_K; // == 0 if QK_K == 256
2863
- const int kqsx = k % QI6_K; // == k if QK_K == 256
2864
-
2865
- int vl[2];
2866
- int vh[2];
2867
- int u[2*QR4_K];
2868
- float d8[QR4_K];
2869
-
2870
- const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2));
2871
-
2872
- vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
2873
- vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
2874
-
2875
- vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset;
2876
- vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset;
2877
-
2878
- const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
2879
- uint16_t aux[2];
2880
- const int l = bq8_offset/2;
2881
- if (l < 2) {
2882
- aux[0] = scales[l+0] & 0x3f3f;
2883
- aux[1] = scales[l+2] & 0x3f3f;
2884
- } else {
2885
- aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
2886
- aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
2887
- }
2888
- const uint8_t * sc = (const uint8_t *)aux;
2889
- const uint8_t * m = sc + 2;
2890
-
2891
- for (int l = 0; l < QR5_K; ++l) {
2892
- const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
2893
- u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
2894
- u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
2895
- d8[l] = y_ds[kqsy / QI8_1].x;
2896
- }
2897
-
2898
- return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K + kbx], d8);
2899
- }
2900
-
2901
- #define VDR_q6_K_q8_1 1
2902
-
2903
- static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl(
2904
- const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
2905
- const float & d, const float * __restrict__ d8) {
2906
-
2907
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2908
- float sumf = 0.0f;
2909
-
2910
- for (int i = 0; i < QR6_K; ++i) {
2911
- const int sc = scales[4*i];
2912
-
2913
- const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
2914
-
2915
- const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
2916
-
2917
- const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
2918
-
2919
- sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
2920
- }
3007
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
2921
3008
 
2922
- return d*sumf;
2923
- #else
2924
- return 0.0f; // only to satisfy the compiler
2925
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
3009
+ const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
3010
+ const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
3011
+ return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
2926
3012
  }
2927
3013
 
2928
3014
  static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -2942,33 +3028,32 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
2942
3028
  int u[QR6_K];
2943
3029
  float d8[QR6_K];
2944
3030
 
3031
+ #pragma unroll
2945
3032
  for (int i = 0; i < QR6_K; ++i) {
2946
3033
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
2947
3034
  d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
2948
3035
  }
2949
3036
 
2950
- return vec_dot_q6_K_q8_1_impl(vl, vh, u, scales, bq6_K->d, d8);
3037
+ return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
2951
3038
  }
2952
3039
 
2953
- static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
3040
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2954
3041
 
2955
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2956
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K];
2957
- __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
2958
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
3042
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
3043
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
3044
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
2959
3045
 
2960
3046
  *x_ql = tile_x_ql;
2961
3047
  *x_dm = tile_x_dm;
2962
- *x_qh = tile_x_qh;
2963
3048
  *x_sc = tile_x_sc;
2964
3049
  }
2965
3050
 
2966
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
3051
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
2967
3052
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2968
3053
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2969
3054
 
2970
3055
  __builtin_assume(i_offset >= 0);
2971
- __builtin_assume(i_offset < 8);
3056
+ __builtin_assume(i_offset < nwarps);
2972
3057
  __builtin_assume(k >= 0);
2973
3058
  __builtin_assume(k < WARP_SIZE);
2974
3059
 
@@ -2978,7 +3063,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
2978
3063
  const block_q6_K * bx0 = (block_q6_K *) vx;
2979
3064
 
2980
3065
  #pragma unroll
2981
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
3066
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2982
3067
  int i = i0 + i_offset;
2983
3068
 
2984
3069
  if (need_check) {
@@ -2986,42 +3071,43 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
2986
3071
  }
2987
3072
 
2988
3073
  const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
3074
+ const int ky = QR6_K*kqsx;
2989
3075
 
2990
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->ql, kqsx);
2991
- }
2992
-
2993
- const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
2994
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2995
-
2996
- #pragma unroll
2997
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
2998
- int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
3076
+ const int ql = get_int_from_uint8(bxi->ql, kqsx);
3077
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
3078
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
2999
3079
 
3000
- if (need_check) {
3001
- i = min(i, i_max);
3002
- }
3080
+ const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
3081
+ const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
3082
+ const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
3003
3083
 
3004
- const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
3084
+ const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
3085
+ const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
3005
3086
 
3006
- x_dm[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd].x = bxi->d;
3087
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
3088
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
3007
3089
  }
3008
3090
 
3091
+ const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
3092
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
3093
+ float * x_dmf = (float *) x_dm;
3094
+
3009
3095
  #pragma unroll
3010
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
3011
- int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
3096
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
3097
+ int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
3012
3098
 
3013
3099
  if (need_check) {
3014
3100
  i = min(i, i_max);
3015
3101
  }
3016
3102
 
3017
- const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI6_K/2);
3103
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
3018
3104
 
3019
- x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->qh, k % (QI6_K/2));
3105
+ x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
3020
3106
  }
3021
3107
 
3022
3108
  #pragma unroll
3023
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
3024
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
3109
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
3110
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
3025
3111
 
3026
3112
  if (need_check) {
3027
3113
  i = min(i, i_max);
@@ -3037,39 +3123,17 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
3037
3123
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3038
3124
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
3039
3125
 
3040
- __builtin_assume(i >= 0);
3041
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
3042
- __builtin_assume(j >= 0);
3043
- __builtin_assume(j < WARP_SIZE);
3044
- __builtin_assume(k >= 0);
3045
- __builtin_assume(k < WARP_SIZE);
3046
-
3047
- const int kbx = k / QI6_K; // == 0 if QK_K == 256
3048
- const int kqsx = k % QI6_K; // == k if QK_K == 256
3049
-
3050
- const int bq8_offset = 2 * QR6_K * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/4);
3051
- const int scale_offset = (QI6_K/4) * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/8);
3052
- const int vh_shift = 2 * ((kqsx % (QI6_K/2)) / (QI6_K/4));
3053
-
3054
- const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI6_K/2) + (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)] >> vh_shift;
3055
-
3056
- const int x_sc_offset = i * (WARP_SIZE/8) + i/8 + kbx * (QI6_K/8);
3057
- const int8_t * scales = ((int8_t *) (x_sc + x_sc_offset)) + scale_offset;
3126
+ const float * x_dmf = (const float *) x_dm;
3127
+ const float * y_df = (const float *) y_ds;
3058
3128
 
3059
- int u[QR6_K];
3060
- float d8[QR6_K];
3061
-
3062
- for (int l = 0; l < QR6_K; ++l) {
3063
- const int kqsy = j * (QR6_K*WARP_SIZE) + kbx * (QR6_K*QI6_K) + (bq8_offset + 2*l)*QI8_1 + kqsx % QI8_1;
3064
- u[l] = y_qs[kqsy];
3065
- d8[l] = y_ds[kqsy / QI8_1].x;
3066
- }
3129
+ const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
3067
3130
 
3068
- return vec_dot_q6_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales,
3069
- x_dm[i * (WARP_SIZE/QI6_K) + i/QI6_K + kbx].x, d8);
3131
+ const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
3132
+ const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
3133
+ return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
3070
3134
  }
3071
3135
 
3072
- template <int qk, int qr, int qi, typename block_q_t,
3136
+ template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
3073
3137
  allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
3074
3138
  static __global__ void mul_mat_q(
3075
3139
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
@@ -3084,14 +3148,11 @@ static __global__ void mul_mat_q(
3084
3148
 
3085
3149
  const int & ncols_dst = ncols_y;
3086
3150
 
3087
- const int tid_x = threadIdx.x;
3088
- const int tid_y = threadIdx.y;
3089
-
3090
- const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
3151
+ const int row_dst_0 = blockIdx.x*mmq_y;
3091
3152
  const int & row_x_0 = row_dst_0;
3092
- const int row_dst = row_dst_0 + tid_x;
3153
+ const int row_dst = row_dst_0 + threadIdx.x;
3093
3154
 
3094
- const int col_dst_0 = blockIdx.y*WARP_SIZE;
3155
+ const int col_dst_0 = blockIdx.y*mmq_x;
3095
3156
  const int & col_y_0 = col_dst_0;
3096
3157
 
3097
3158
  int * tile_x_ql = nullptr;
@@ -3101,55 +3162,65 @@ static __global__ void mul_mat_q(
3101
3162
 
3102
3163
  allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
3103
3164
 
3104
- const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1;
3105
-
3106
- __shared__ int tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
3107
- __shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];
3165
+ __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
3166
+ __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
3108
3167
 
3109
- float sum[GGML_CUDA_MMQ_Y/WARP_SIZE][4] = {0.0f};
3168
+ float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
3110
3169
 
3111
3170
  for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
3112
3171
 
3113
3172
  load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
3114
- tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x);
3173
+ threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
3115
3174
 
3175
+ #pragma unroll
3116
3176
  for (int ir = 0; ir < qr; ++ir) {
3117
- const int kqs = ir*WARP_SIZE + tid_x;
3177
+ const int kqs = ir*WARP_SIZE + threadIdx.x;
3118
3178
  const int kbxd = kqs / QI8_1;
3119
3179
 
3120
- for (int i = 0; i < WARP_SIZE; i += 8) {
3121
- const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
3180
+ #pragma unroll
3181
+ for (int i = 0; i < mmq_x; i += nwarps) {
3182
+ const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
3122
3183
 
3123
3184
  const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
3124
3185
 
3125
- tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1);
3186
+ const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
3187
+ tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
3126
3188
  }
3127
- }
3128
3189
 
3129
- for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) {
3130
- const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE;
3131
- const int kby = tid_x % blocks_per_tile_y_col;
3132
- const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
3133
- tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby] = y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds;
3134
- }
3190
+ #pragma unroll
3191
+ for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
3192
+ const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
3193
+ const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
3194
+ const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
3195
+
3196
+ // if the sum is not needed it's faster to transform the scale to f32 ahead of time
3197
+ const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
3198
+ half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
3199
+ if (need_sum) {
3200
+ *dsi_dst = *dsi_src;
3201
+ } else {
3202
+ float * dfi_dst = (float *) dsi_dst;
3203
+ *dfi_dst = (*dsi_src).x;
3204
+ }
3205
+ }
3135
3206
 
3136
- __syncthreads();
3207
+ __syncthreads();
3137
3208
 
3138
- #if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
3139
- #pragma unroll
3140
- #endif // __CUDA_ARCH__ >= 700
3141
- for (int k = 0; k < WARP_SIZE; k += vdr) {
3209
+ // #pragma unroll // unrolling this loop causes too much register pressure
3210
+ for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
3142
3211
  #pragma unroll
3143
- for (int j = 0; j < WARP_SIZE; j += 8) {
3212
+ for (int j = 0; j < mmq_x; j += nwarps) {
3144
3213
  #pragma unroll
3145
- for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
3146
- sum[i/WARP_SIZE][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
3147
- tid_x + i, tid_y + j, k);
3214
+ for (int i = 0; i < mmq_y; i += WARP_SIZE) {
3215
+ sum[i/WARP_SIZE][j/nwarps] += vec_dot(
3216
+ tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
3217
+ threadIdx.x + i, threadIdx.y + j, k);
3218
+ }
3148
3219
  }
3149
3220
  }
3150
- }
3151
3221
 
3152
- __syncthreads();
3222
+ __syncthreads();
3223
+ }
3153
3224
  }
3154
3225
 
3155
3226
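
On the need_sum branch added to mul_mat_q above: when the dot product of a quantization type also needs the sum of the q8_1 values, the kernel keeps the full half2 (d, s) in tile_y_ds; otherwise it stores only the scale, converted to f32 ahead of time, as the in-code comment notes. A condensed CUDA sketch of that store, with need_sum shown as a runtime flag purely for illustration (in the kernel it is a compile-time template parameter, and the helper name is ours):

    #include <cuda_fp16.h>

    // Store either the full (d, s) pair or just d as a ready-to-use float.
    static __device__ void store_y_scale(const half2 * dsi_src, half2 * dsi_dst, bool need_sum) {
        if (need_sum) {
            *dsi_dst = *dsi_src;                           // d and s both used later
        } else {
            *((float *) dsi_dst) = __low2float(*dsi_src);  // only d, pre-converted to f32
        }
    }
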
 
@@ -3157,15 +3228,15 @@ static __global__ void mul_mat_q(
3157
3228
  return;
3158
3229
  }
3159
3230
 
3160
- for (int j = 0; j < WARP_SIZE; j += 8) {
3161
- const int col_dst = col_dst_0 + j + tid_y;
3231
+ for (int j = 0; j < mmq_x; j += nwarps) {
3232
+ const int col_dst = col_dst_0 + j + threadIdx.y;
3162
3233
 
3163
3234
  if (col_dst >= ncols_dst) {
3164
3235
  return;
3165
3236
  }
3166
3237
 
3167
- for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
3168
- dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/8];
3238
+ for (int i = 0; i < mmq_y; i += WARP_SIZE) {
3239
+ dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/nwarps];
3169
3240
  }
3170
3241
  }
3171
3242
  }
@@ -3780,7 +3851,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
3780
3851
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3781
3852
  const dim3 block_nums(1, block_num_y, 1);
3782
3853
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3783
- mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1>
3854
+ mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
3784
3855
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3785
3856
  }
3786
3857
 
@@ -3789,7 +3860,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
3789
3860
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3790
3861
  const dim3 block_nums(1, block_num_y, 1);
3791
3862
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3792
- mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1>
3863
+ mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
3793
3864
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3794
3865
  }
3795
3866
 
@@ -3798,7 +3869,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
3798
3869
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3799
3870
  const dim3 block_nums(1, block_num_y, 1);
3800
3871
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3801
- mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1>
3872
+ mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
3802
3873
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3803
3874
  }
3804
3875
 
@@ -3807,7 +3878,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
3807
3878
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3808
3879
  const dim3 block_nums(1, block_num_y, 1);
3809
3880
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3810
- mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1>
3881
+ mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
3811
3882
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3812
3883
  }
3813
3884
 
@@ -3816,7 +3887,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
3816
3887
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3817
3888
  const dim3 block_nums(1, block_num_y, 1);
3818
3889
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3819
- mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1>
3890
+ mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
3820
3891
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3821
3892
  }
3822
3893
 
@@ -3867,17 +3938,52 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
3867
3938
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3868
3939
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3869
3940
 
3870
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3871
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3872
- const dim3 block_nums(block_num_x, block_num_y, 1);
3873
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3874
-
3875
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3876
- mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<false>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3877
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3941
+ int id;
3942
+ CUDA_CHECK(cudaGetDevice(&id));
3943
+ const int compute_capability = g_compute_capabilities[id];
3944
+
3945
+ if (compute_capability >= CC_TURING) {
3946
+ const int mmq_x = 64;
3947
+ const int mmq_y = 128;
3948
+ const int nwarps = 4;
3949
+
3950
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
3951
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3952
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3953
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
3954
+
3955
+ if (nrows_x % mmq_y == 0) {
3956
+ const bool need_check = false;
3957
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3958
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3959
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3960
+ } else {
3961
+ const bool need_check = true;
3962
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3963
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3964
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3965
+ }
3878
3966
  } else {
3879
- mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<true>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3880
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3967
+ const int mmq_x = 64;
3968
+ const int mmq_y = 64;
3969
+ const int nwarps = 4;
3970
+
3971
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
3972
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3973
+ const dim3 block_nums(block_num_x, block_num_y, 1);
3974
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
3975
+
3976
+ if (nrows_x % mmq_y == 0) {
3977
+ const bool need_check = false;
3978
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3979
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3980
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3981
+ } else {
3982
+ const bool need_check = true;
3983
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3984
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3985
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3986
+ }
3881
3987
  }
3882
3988
  }
3883
3989
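
All of the rewritten launchers in this part of the diff follow the same pattern: read the device's compute capability, pick per-architecture tile constants (larger tiles on Turing and newer), and derive the grid from ceiling divisions of the matrix dimensions, so that each block computes an mmq_y x mmq_x tile of dst with WARP_SIZE x nwarps threads. A condensed host-side sketch of that shared arithmetic, using the q4_0 constants from the launcher above (the struct and helper names are ours):

    #include <cuda_runtime.h>

    #define WARP_SIZE 32
    #define CC_TURING 700

    struct mmq_config { int mmq_x, mmq_y, nwarps; };

    // Tile sizes are hard-coded per quantization type; these are the q4_0 values above.
    static mmq_config choose_mmq_config_q4_0(const int compute_capability) {
        if (compute_capability >= CC_TURING) {
            return {64, 128, 4};
        }
        return {64, 64, 4};
    }

    // Each block computes an mmq_y x mmq_x tile of dst with WARP_SIZE x nwarps threads.
    static void mmq_launch_dims(const int nrows_x, const int ncols_y, const mmq_config & c,
                                dim3 & block_nums, dim3 & block_dims) {
        const int block_num_x = (nrows_x + c.mmq_y - 1) / c.mmq_y;
        const int block_num_y = (ncols_y + c.mmq_x - 1) / c.mmq_x;
        block_nums = dim3(block_num_x, block_num_y, 1);
        block_dims = dim3(WARP_SIZE, c.nwarps, 1);
    }
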
 
@@ -3885,17 +3991,53 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
3885
3991
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3886
3992
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3887
3993
 
3888
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3889
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3890
- const dim3 block_nums(block_num_x, block_num_y, 1);
3891
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3892
-
3893
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3894
- mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<false>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3895
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3994
+ int id;
3995
+ CUDA_CHECK(cudaGetDevice(&id));
3996
+ const int compute_capability = g_compute_capabilities[id];
3997
+
3998
+ if (compute_capability >= CC_TURING) {
3999
+ const int mmq_x = 64;
4000
+ const int mmq_y = 128;
4001
+ const int nwarps = 4;
4002
+
4003
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4004
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4005
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4006
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4007
+
4008
+ if (nrows_x % mmq_y == 0) {
4009
+ const bool need_check = false;
4010
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4011
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4012
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4013
+ } else {
4014
+ const bool need_check = true;
4015
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4016
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4017
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4018
+ }
3896
4019
  } else {
3897
- mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<true>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3898
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4020
+ const int mmq_x = 64;
4021
+ const int mmq_y = 64;
4022
+ const int nwarps = 8;
4023
+
4024
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4025
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4026
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4027
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4028
+
4029
+ if (nrows_x % mmq_y == 0) {
4030
+ const bool need_check = false;
4031
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4032
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4033
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4034
+ } else {
4035
+ const bool need_check = true;
4036
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4037
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4038
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4039
+ }
4040
+
3899
4041
  }
3900
4042
  }
3901
4043
 
@@ -3903,17 +4045,52 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
3903
4045
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3904
4046
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3905
4047
 
3906
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3907
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3908
- const dim3 block_nums(block_num_x, block_num_y, 1);
3909
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3910
-
3911
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3912
- mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<false>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3913
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4048
+ int id;
4049
+ CUDA_CHECK(cudaGetDevice(&id));
4050
+ const int compute_capability = g_compute_capabilities[id];
4051
+
4052
+ if (compute_capability >= CC_TURING) {
4053
+ const int mmq_x = 128;
4054
+ const int mmq_y = 64;
4055
+ const int nwarps = 4;
4056
+
4057
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4058
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4059
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4060
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4061
+
4062
+ if (nrows_x % mmq_y == 0) {
4063
+ const bool need_check = false;
4064
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4065
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4066
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4067
+ } else {
4068
+ const bool need_check = true;
4069
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4070
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4071
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4072
+ }
3914
4073
  } else {
3915
- mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<true>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3916
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4074
+ const int mmq_x = 64;
4075
+ const int mmq_y = 64;
4076
+ const int nwarps = 8;
4077
+
4078
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4079
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4080
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4081
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4082
+
4083
+ if (nrows_x % mmq_y == 0) {
4084
+ const bool need_check = false;
4085
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4086
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4087
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4088
+ } else {
4089
+ const bool need_check = true;
4090
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4091
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4092
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4093
+ }
3917
4094
  }
3918
4095
  }
3919
4096
 
@@ -3921,17 +4098,52 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
3921
4098
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3922
4099
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3923
4100
 
3924
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3925
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3926
- const dim3 block_nums(block_num_x, block_num_y, 1);
3927
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3928
-
3929
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3930
- mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<false>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3931
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4101
+ int id;
4102
+ CUDA_CHECK(cudaGetDevice(&id));
4103
+ const int compute_capability = g_compute_capabilities[id];
4104
+
4105
+ if (compute_capability >= CC_TURING) {
4106
+ const int mmq_x = 128;
4107
+ const int mmq_y = 64;
4108
+ const int nwarps = 8;
4109
+
4110
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4111
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4112
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4113
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4114
+
4115
+ if (nrows_x % mmq_y == 0) {
4116
+ const bool need_check = false;
4117
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4118
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4119
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4120
+ } else {
4121
+ const bool need_check = true;
4122
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4123
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4124
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4125
+ }
3932
4126
  } else {
3933
- mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<true>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3934
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4127
+ const int mmq_x = 64;
4128
+ const int mmq_y = 64;
4129
+ const int nwarps = 8;
4130
+
4131
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4132
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4133
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4134
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4135
+
4136
+ if (nrows_x % mmq_y == 0) {
4137
+ const bool need_check = false;
4138
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4139
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4140
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4141
+ } else {
4142
+ const bool need_check = true;
4143
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4144
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4145
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4146
+ }
3935
4147
  }
3936
4148
  }
3937
4149
 
@@ -3939,17 +4151,52 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
3939
4151
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3940
4152
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3941
4153
 
3942
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3943
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
3944
- const dim3 block_nums(block_num_x, block_num_y, 1);
3945
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
3946
-
3947
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3948
- mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<false>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3949
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4154
+ int id;
4155
+ CUDA_CHECK(cudaGetDevice(&id));
4156
+ const int compute_capability = g_compute_capabilities[id];
4157
+
4158
+ if (compute_capability >= CC_TURING) {
4159
+ const int mmq_x = 128;
4160
+ const int mmq_y = 64;
4161
+ const int nwarps = 4;
4162
+
4163
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4164
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4165
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4166
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4167
+
4168
+ if (nrows_x % mmq_y == 0) {
4169
+ const bool need_check = false;
4170
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4171
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4172
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4173
+ } else {
4174
+ const bool need_check = true;
4175
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4176
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4177
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4178
+ }
3950
4179
  } else {
3951
- mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<true>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3952
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4180
+ const int mmq_x = 64;
4181
+ const int mmq_y = 64;
4182
+ const int nwarps = 8;
4183
+
4184
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4185
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4186
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4187
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4188
+
4189
+ if (nrows_x % mmq_y == 0) {
4190
+ const bool need_check = false;
4191
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4192
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4193
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4194
+ } else {
4195
+ const bool need_check = true;
4196
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4197
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4198
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4199
+ }
3953
4200
  }
3954
4201
  }
3955
4202
 
@@ -3957,17 +4204,52 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
      const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
      const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-     const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-     const dim3 block_nums(block_num_x, block_num_y, 1);
-     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-     if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-         mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<false>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
-             <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+     int id;
+     CUDA_CHECK(cudaGetDevice(&id));
+     const int compute_capability = g_compute_capabilities[id];
+
+     if (compute_capability >= CC_TURING) {
+         const int mmq_x = 64;
+         const int mmq_y = 128;
+         const int nwarps = 4;
+
+         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+         const dim3 block_nums(block_num_x, block_num_y, 1);
+         const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+         if (nrows_x % mmq_y == 0) {
+             const bool need_check = false;
+             mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+                 load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         } else {
+             const bool need_check = true;
+             mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+                 load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         }
      } else {
-         mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<true>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
-             <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         const int mmq_x = 64;
+         const int mmq_y = 64;
+         const int nwarps = 8;
+
+         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+         const dim3 block_nums(block_num_x, block_num_y, 1);
+         const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+         if (nrows_x % mmq_y == 0) {
+             const bool need_check = false;
+             mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+                 load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         } else {
+             const bool need_check = true;
+             mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+                 load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         }
      }
  }

@@ -3975,17 +4257,52 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
      const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
      const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-     const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-     const dim3 block_nums(block_num_x, block_num_y, 1);
-     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-     if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-         mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<false>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
-             <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+     int id;
+     CUDA_CHECK(cudaGetDevice(&id));
+     const int compute_capability = g_compute_capabilities[id];
+
+     if (compute_capability >= CC_TURING) {
+         const int mmq_x = 128;
+         const int mmq_y = 128;
+         const int nwarps = 4;
+
+         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+         const dim3 block_nums(block_num_x, block_num_y, 1);
+         const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+         if (nrows_x % mmq_y == 0) {
+             const bool need_check = false;
+             mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+                 load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         } else {
+             const bool need_check = true;
+             mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+                 load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         }
      } else {
-         mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<true>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
-             <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         const int mmq_x = 64;
+         const int mmq_y = 64;
+         const int nwarps = 8;
+
+         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+         const dim3 block_nums(block_num_x, block_num_y, 1);
+         const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+         if (nrows_x % mmq_y == 0) {
+             const bool need_check = false;
+             mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+                 load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         } else {
+             const bool need_check = true;
+             mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+                 load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         }
      }
  }

@@ -3993,17 +4310,52 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
      const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
      const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-     const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-     const dim3 block_nums(block_num_x, block_num_y, 1);
-     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-     if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-         mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<false>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
-             <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+     int id;
+     CUDA_CHECK(cudaGetDevice(&id));
+     const int compute_capability = g_compute_capabilities[id];
+
+     if (compute_capability >= CC_TURING) {
+         const int mmq_x = 64;
+         const int mmq_y = 128;
+         const int nwarps = 4;
+
+         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+         const dim3 block_nums(block_num_x, block_num_y, 1);
+         const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+         if (nrows_x % mmq_y == 0) {
+             const bool need_check = false;
+             mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+                 load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         } else {
+             const bool need_check = true;
+             mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+                 load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         }
      } else {
-         mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<true>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
-             <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         const int mmq_x = 32;
+         const int mmq_y = 64;
+         const int nwarps = 8;
+
+         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+         const dim3 block_nums(block_num_x, block_num_y, 1);
+         const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+         if (nrows_x % mmq_y == 0) {
+             const bool need_check = false;
+             mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+                 load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         } else {
+             const bool need_check = true;
+             mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+                 load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         }
      }
  }

@@ -4011,17 +4363,52 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
      const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
      const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-     const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-     const dim3 block_nums(block_num_x, block_num_y, 1);
-     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-     if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-         mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<false>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
-             <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+     int id;
+     CUDA_CHECK(cudaGetDevice(&id));
+     const int compute_capability = g_compute_capabilities[id];
+
+     if (compute_capability >= CC_TURING) {
+         const int mmq_x = 64;
+         const int mmq_y = 128;
+         const int nwarps = 4;
+
+         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+         const dim3 block_nums(block_num_x, block_num_y, 1);
+         const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+         if (nrows_x % mmq_y == 0) {
+             const bool need_check = false;
+             mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+                 load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         } else {
+             const bool need_check = true;
+             mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+                 load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         }
      } else {
-         mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<true>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
-             <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         const int mmq_x = 64;
+         const int mmq_y = 64;
+         const int nwarps = 8;
+
+         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+         const dim3 block_nums(block_num_x, block_num_y, 1);
+         const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+         if (nrows_x % mmq_y == 0) {
+             const bool need_check = false;
+             mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+                 load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         } else {
+             const bool need_check = true;
+             mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+                 load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         }
      }
  }

@@ -4029,17 +4416,52 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
      const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
      const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-     const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
-     const dim3 block_nums(block_num_x, block_num_y, 1);
-     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-
-     if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-         mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<false>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
-             <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+     int id;
+     CUDA_CHECK(cudaGetDevice(&id));
+     const int compute_capability = g_compute_capabilities[id];
+
+     if (compute_capability >= CC_TURING) {
+         const int mmq_x = 64;
+         const int mmq_y = 64;
+         const int nwarps = 4;
+
+         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+         const dim3 block_nums(block_num_x, block_num_y, 1);
+         const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+         if (nrows_x % mmq_y == 0) {
+             const bool need_check = false;
+             mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+                 load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         } else {
+             const bool need_check = true;
+             mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+                 load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         }
      } else {
-         mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<true>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
-             <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         const int mmq_x = 32;
+         const int mmq_y = 64;
+         const int nwarps = 8;
+
+         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+         const dim3 block_nums(block_num_x, block_num_y, 1);
+         const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+         if (nrows_x % mmq_y == 0) {
+             const bool need_check = false;
+             mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+                 load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         } else {
+             const bool need_check = true;
+             mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+                 load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+         }
      }
  }
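Every launcher above, q8_0 and the K-quants alike, follows the same pattern: query the current device's compute capability, pick a tile shape and warp count (larger mmq_y tiles with fewer warps once the device is Turing-class, i.e. CC_TURING = 700 or newer; smaller tiles with more warps otherwise), and then dispatch either the checked or the unchecked kernel instantiation. A condensed host-side sketch of that selection follows; the struct and function names here are illustrative and do not exist in ggml.

#include <cstdio>

// Hypothetical condensation of the launch parameters chosen by the q2_K launcher above.
struct mmq_config {
    int mmq_x;   // output-tile width per block
    int mmq_y;   // output-tile height per block
    int nwarps;  // warps per block (the block is WARP_SIZE x nwarps threads)
};

static mmq_config pick_q2_K_config(int compute_capability) {
    const int cc_turing = 700; // same threshold as CC_TURING
    if (compute_capability >= cc_turing) {
        return {64, 128, 4};   // bigger tile, fewer warps on Turing and newer
    }
    return {64, 64, 8};        // pre-Turing fallback, close to the old GGML_CUDA_MMQ_Y = 64 behaviour
}

int main() {
    const mmq_config cfg = pick_q2_K_config(750); // e.g. an RTX 20xx card reports CC 7.5
    printf("mmq_x = %d, mmq_y = %d, nwarps = %d\n", cfg.mmq_x, cfg.mmq_y, cfg.nwarps);
    return 0;
}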
 
@@ -4214,20 +4636,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
  }


- static void * g_scratch_buffer = nullptr;
- static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
- static size_t g_scratch_offset = 0;
-
- static int g_device_count = -1;
- static int g_main_device = 0;
- static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
- static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
- static bool g_mul_mat_q = false;
-
- static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
-
- static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
  void ggml_init_cublas() {
      static bool initialized = false;

@@ -4583,6 +4991,37 @@ inline void ggml_cuda_op_mul_mat_q(
      (void) i1;
  }

+ static int64_t get_row_rounding(ggml_type type) {
+     int max_compute_capability = INT_MIN;
+     for (int id = 0; id < g_device_count; ++id) {
+         if (max_compute_capability < g_compute_capabilities[id]
+             && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+             max_compute_capability = g_compute_capabilities[id];
+         }
+     }
+
+     switch(type) {
+         case GGML_TYPE_Q4_0:
+         case GGML_TYPE_Q4_1:
+             return max_compute_capability >= CC_TURING ? 128 : 64;
+         case GGML_TYPE_Q5_0:
+         case GGML_TYPE_Q5_1:
+         case GGML_TYPE_Q8_0:
+             return 64;
+         case GGML_TYPE_F16:
+             return 1;
+         case GGML_TYPE_Q2_K:
+         case GGML_TYPE_Q3_K:
+         case GGML_TYPE_Q4_K:
+         case GGML_TYPE_Q5_K:
+             return max_compute_capability >= CC_TURING ? 128 : 64;
+         case GGML_TYPE_Q6_K:
+             return 64;
+         default:
+             GGML_ASSERT(false);
+     }
+ }
+
  inline void ggml_cuda_op_mul_mat_vec(
      const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
      float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
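get_row_rounding determines the granularity at which rows of a GPU-split tensor may be assigned to a device: boundaries have to fall on a multiple of the largest mmq_y tile that any participating device might use (128 for most quant types once a Turing card is involved, 64 otherwise), so that no kernel tile straddles two GPUs. The g_tensor_split comparison skips devices whose slice is empty. A standalone C++ sketch of the same capability scan, with stand-in values in place of the real globals:

#include <climits>
#include <cstdio>

int main() {
    // Illustrative stand-ins for g_device_count, g_compute_capabilities and g_tensor_split.
    const int   device_count    = 2;
    const int   compute_cap[2]  = {610, 750};   // e.g. one Pascal card, one Turing card
    const float tensor_split[2] = {0.0f, 0.4f}; // device 0 starts at 0%, device 1 at 40% of the rows

    int max_cc = INT_MIN;
    for (int id = 0; id < device_count; ++id) {
        // A device only matters if its slice is non-empty: its start fraction must be
        // strictly below the next device's start (or below 1.0 for the last device).
        const float next = id + 1 < device_count ? tensor_split[id + 1] : 1.0f;
        if (max_cc < compute_cap[id] && tensor_split[id] < next) {
            max_cc = compute_cap[id];
        }
    }
    // Matches the Q4_0/Q4_1 and Q2_K..Q5_K cases above: 128-row granularity once Turing participates.
    printf("row rounding = %d\n", max_cc >= 700 ? 128 : 64);
    return 0;
}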
@@ -4983,14 +5422,16 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm

          int64_t row_low, row_high;
          if (split) {
+             const int64_t rounding = get_row_rounding(src0->type);
+
              row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
-             row_low -= row_low % GGML_CUDA_MMQ_Y;
+             row_low -= row_low % rounding;

              if (id == g_device_count - 1) {
                  row_high = nrows0;
              } else {
                  row_high = nrows0*g_tensor_split[id + 1];
-                 row_high -= row_high % GGML_CUDA_MMQ_Y;
+                 row_high -= row_high % rounding;
              }
          } else {
              row_low = 0;
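With that rounding value in hand, the split boundaries computed above are just the fractional g_tensor_split points snapped down to a multiple of the rounding, with the last device taking whatever remains. A worked sketch with made-up sizes (two devices, a 40/60 split):

#include <cstdint>
#include <cstdio>

int main() {
    const int     device_count = 2;
    const int64_t nrows0       = 4608;          // rows of src0
    const int64_t rounding     = 128;           // as returned by get_row_rounding() for a K-quant on Turing
    const float   split[2]     = {0.0f, 0.4f};  // hypothetical g_tensor_split

    for (int id = 0; id < device_count; ++id) {
        int64_t row_low = id == 0 ? 0 : (int64_t)(nrows0*split[id]);
        row_low -= row_low % rounding;          // snap down so no mul_mat_q tile straddles two devices

        int64_t row_high;
        if (id == device_count - 1) {
            row_high = nrows0;                  // the last device takes the remainder
        } else {
            row_high = (int64_t)(nrows0*split[id + 1]);
            row_high -= row_high % rounding;
        }
        printf("device %d: rows [%lld, %lld)\n", id, (long long)row_low, (long long)row_high);
    }
    return 0;
}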
@@ -5203,7 +5644,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
      if (split && g_device_count > 1) {
          CUDA_CHECK(cudaSetDevice(g_main_device));
          for (int id = 0; id < g_device_count; ++id) {
-             if (id != g_main_device) {
+             if (id != g_main_device && src0_extra->events[id]) {
                  CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
              }
          }
@@ -5347,7 +5788,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
      } else {
          int min_compute_capability = INT_MAX;
          for (int id = 0; id < g_device_count; ++id) {
-             if (min_compute_capability > g_compute_capabilities[id]) {
+             if (min_compute_capability > g_compute_capabilities[id]
+                 && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
                  min_compute_capability = g_compute_capabilities[id];
              }
          }
@@ -5468,14 +5910,16 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
              row_low = 0;
              row_high = nrows;
          } else if (backend == GGML_BACKEND_GPU_SPLIT) {
+             const int64_t rounding = get_row_rounding(tensor->type);
+
              row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
-             row_low -= row_low % GGML_CUDA_MMQ_Y;
+             row_low -= row_low % rounding;

              if (id == g_device_count - 1) {
                  row_high = nrows;
              } else {
                  row_high = nrows*g_tensor_split[id + 1];
-                 row_high -= row_high % GGML_CUDA_MMQ_Y;
+                 row_high -= row_high % rounding;
              }
          } else {
              GGML_ASSERT(false);