llama_cpp 0.3.6 → 0.3.8

This diff shows the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
@@ -14,6 +14,7 @@
  #include "ggml.h"

  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+ #define CC_TURING 700

  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
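Note on the MIN_CC_DP4A constant used throughout this file: __dp4a multiplies four packed int8 values and adds an accumulator in a single instruction, but it only exists on compute capability 6.1 (610) and newer, so every call site is guarded. A minimal sketch of that guard pattern; byte_dot and its per-byte fallback are illustrative, not part of the package:

    static __device__ __forceinline__ int byte_dot(const int a, const int b, const int c) {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
        return __dp4a(a, b, c); // 4x int8 multiply-accumulate in one instruction
    #else
        // fallback: unpack both operands and accumulate per byte
        const char4 va = *reinterpret_cast<const char4 *>(&a);
        const char4 vb = *reinterpret_cast<const char4 *>(&b);
        return c + va.x*vb.x + va.y*vb.y + va.z*vb.z + va.w*vb.w;
    #endif
    }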
@@ -262,10 +263,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256

- #ifndef GGML_CUDA_MMQ_Y
- #define GGML_CUDA_MMQ_Y 64
- #endif // GGML_CUDA_MMQ_Y
-
  // dmmv = dequantize_mul_mat_vec
  #ifndef GGML_CUDA_DMMV_X
  #define GGML_CUDA_DMMV_X 32
@@ -285,6 +282,20 @@ struct ggml_tensor_extra_gpu {
  cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
  };

+ static int g_device_count = -1;
+ static int g_main_device = 0;
+ static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+ static bool g_mul_mat_q = false;
+
+ static void * g_scratch_buffer = nullptr;
+ static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
+ static size_t g_scratch_offset = 0;
+
+ static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+ static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
+
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -1383,9 +1394,12 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
  sumi = __dp4a(vi1, u[2*i+1], sumi);
  }

+ const float2 ds8f = __half22float2(ds8);
+
  // second part effectively subtracts 8 from each quant value
- return d4 * (sumi * __half2float(ds8.x) - (8*vdr/QI4_0) * __half2float(ds8.y));
+ return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
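Why "subtracting 8" can be folded into the block scales, as the comment above says: a sketch under the usual ggml conventions, assuming q4_0 dequantizes as x_i = d4*(q_i - 8) and the q8_1 block packs ds8 = (d8, d8*sum(u_i)); those layouts are not spelled out in this hunk.

    sum_i x_i*y_i = sum_i d4*(q_i - 8) * d8*u_i
                  = d4 * (d8*sum_i q_i*u_i - 8 * d8*sum_i u_i)
                  = d4 * (sumi * ds8f.x    - 8 * ds8f.y)      // sumi comes from the __dp4a loop

The extra vdr/QI4_0 factor in the return statement accounts for each thread covering only a slice of the block.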
@@ -1410,17 +1424,20 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
  }

  #ifdef GGML_CUDA_F16
- const half2 tmp = __hmul2(dm4, ds8);
- const float d4d8 = __half2float(tmp.x);
- const float m4s8 = __half2float(tmp.y);
+ const float2 tmp = __half22float2(__hmul2(dm4, ds8));
+ const float d4d8 = tmp.x;
+ const float m4s8 = tmp.y;
  #else
- const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x);
- const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y);
+ const float2 dm4f = __half22float2(dm4);
+ const float2 ds8f = __half22float2(ds8);
+ const float d4d8 = dm4f.x * ds8f.x;
+ const float m4s8 = dm4f.y * ds8f.y;
  #endif // GGML_CUDA_F16

  // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
  return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
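The rewrite above trades two per-lane __half2float calls for a single __half22float2 conversion that unpacks both halves of a half2 at once (both intrinsics come from cuda_fp16.h). A self-contained sketch of the idiom; unpack_scales and the lane comments are illustrative assumptions, not package API:

    #include <cuda_fp16.h>

    static __device__ __forceinline__ void unpack_scales(
            const half2 dm4, const half2 ds8, float & d4d8, float & m4s8) {
        const float2 dm4f = __half22float2(dm4); // one conversion yields both lanes
        const float2 ds8f = __half22float2(ds8);
        d4d8 = dm4f.x * ds8f.x; // scale * scale
        m4s8 = dm4f.y * ds8f.y; // min * (q8_1 sum term)
    }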
@@ -1434,6 +1451,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  int sumi = 0;

+ #pragma unroll
  for (int i = 0; i < vdr; ++i) {
  int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
  vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
@@ -1450,9 +1468,12 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
  sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
  }

+ const float2 ds8f = __half22float2(ds8);
+
  // second part effectively subtracts 16 from each quant value
- return d5 * (sumi*__half2float(ds8.x) - (16*vdr/QI5_0) * __half2float(ds8.y));
+ return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1466,6 +1487,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  int sumi = 0;

+ #pragma unroll
  for (int i = 0; i < vdr; ++i) {
  int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
  vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
@@ -1483,18 +1505,21 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
  }

  #ifdef GGML_CUDA_F16
- const half2 tmp = __hmul2(dm5, ds8);
- const float d5d8 = __half2float(tmp.x);
- const float m5s8 = __half2float(tmp.y);
+ const float2 tmp = __half22float2(__hmul2(dm5, ds8));
+ const float d5d8 = tmp.x;
+ const float m5s8 = tmp.y;
  #else
- const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x);
- const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y);
+ const float2 dm5f = __half22float2(dm5);
+ const float2 ds8f = __half22float2(ds8);
+ const float d5d8 = dm5f.x * ds8f.x;
+ const float m5s8 = dm5f.y * ds8f.y;
  #endif // GGML_CUDA_F16

  // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
  return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1503,18 +1528,20 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
  #define VDR_Q8_0_Q8_1_MMQ 8

  template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
- const int * v, const int * u, const float & d8_0, const half2 & ds8_1) {
+ const int * v, const int * u, const float & d8_0, const float & d8_1) {

  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  int sumi = 0;

+ #pragma unroll
  for (int i = 0; i < vdr; ++i) {
  // SIMD dot product of quantized values
  sumi = __dp4a(v[i], u[i], sumi);
  }

- return sumi * d8_0 * __half2float(ds8_1.x);
+ return d8_0*d8_1 * sumi;
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
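The #pragma unroll additions in these dot-product helpers rely on vdr being a template parameter: the trip count is a compile-time constant, so the loop can unroll completely into straight-line __dp4a calls. A standalone sketch of the pattern; dot_vdr is a hypothetical helper, not from the package:

    template <int vdr>
    static __device__ __forceinline__ int dot_vdr(const int * v, const int * u) {
        int sumi = 0;
    #pragma unroll // vdr is known at compile time, so this unrolls fully
        for (int i = 0; i < vdr; ++i) {
            sumi = __dp4a(v[i], u[i], sumi);
        }
        return sumi;
    }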
@@ -1525,23 +1552,374 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
1525
1552
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1526
1553
  int sumi = 0;
1527
1554
 
1555
+ #pragma unroll
1528
1556
  for (int i = 0; i < vdr; ++i) {
1529
1557
  // SIMD dot product of quantized values
1530
1558
  sumi = __dp4a(v[i], u[i], sumi);
1531
1559
  }
1532
1560
 
1533
1561
  #ifdef GGML_CUDA_F16
1534
- const half2 tmp = __hmul2(dm8, ds8);
1535
- const float d8d8 = __half2float(tmp.x);
1536
- const float m8s8 = __half2float(tmp.y);
1562
+ const float2 tmp = __half22float2(__hmul2(dm8, ds8));
1563
+ const float d8d8 = tmp.x;
1564
+ const float m8s8 = tmp.y;
1537
1565
  #else
1538
- const float d8d8 = __half2float(dm8.x) * __half2float(ds8.x);
1539
- const float m8s8 = __half2float(dm8.y) * __half2float(ds8.y);
1566
+ const float2 dm8f = __half22float2(dm8);
1567
+ const float2 ds8f = __half22float2(ds8);
1568
+ const float d8d8 = dm8f.x * ds8f.x;
1569
+ const float m8s8 = dm8f.y * ds8f.y;
1540
1570
  #endif // GGML_CUDA_F16
1541
1571
 
1542
1572
  // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
1543
1573
  return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
1544
1574
  #else
1575
+ assert(false);
1576
+ return 0.0f; // only to satisfy the compiler
1577
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1578
+ }
1579
+
1580
+ #define VDR_Q2_K_Q8_1_MMVQ 1
1581
+ #define VDR_Q2_K_Q8_1_MMQ 2
1582
+
1583
+ // contiguous v/x values
1584
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
1585
+ const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
1586
+ const half2 & dm2, const float * __restrict__ d8) {
1587
+
1588
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1589
+ float sumf_d = 0.0f;
1590
+ float sumf_m = 0.0f;
1591
+
1592
+ #pragma unroll
1593
+ for (int i = 0; i < QR2_K; ++i) {
1594
+ const int sc = scales[2*i];
1595
+
1596
+ const int vi = (v >> (2*i)) & 0x03030303;
1597
+
1598
+ sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
1599
+
1600
+ // fill int with 4x m
1601
+ int m = sc >> 4;
1602
+ m |= m << 8;
1603
+ m |= m << 16;
1604
+ sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
1605
+ }
1606
+
1607
+ const float2 dm2f = __half22float2(dm2);
1608
+
1609
+ return dm2f.x*sumf_d - dm2f.y*sumf_m;
1610
+ #else
1611
+ assert(false);
1612
+ return 0.0f; // only to satisfy the compiler
1613
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1614
+ }
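The "fill int with 4x m" step above exploits the fact that replicating an 8-bit value into all four byte lanes lets __dp4a multiply it against the sum of the four packed q8_1 values in a single instruction. A hypothetical standalone version of the trick (not package code):

    static __device__ __forceinline__ int mul_by_byte_sum(int m, const int packed_u) {
        // m is a 4-bit minimum (0..15); replicate it into every byte lane
        m |= m << 8;  // 0x000000mm -> 0x0000mmmm
        m |= m << 16; // 0x0000mmmm -> 0xmmmmmmmm
        // dp4a(m, u, 0) = m*u0 + m*u1 + m*u2 + m*u3 = m * (u0 + u1 + u2 + u3)
        return __dp4a(m, packed_u, 0);
    }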
1615
+
1616
+ // contiguous u/y values
1617
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
1618
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
1619
+ const half2 & dm2, const float & d8) {
1620
+
1621
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1622
+ int sumi_d = 0;
1623
+ int sumi_m = 0;
1624
+
1625
+ #pragma unroll
1626
+ for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
1627
+ int sumi_d_sc = 0;
1628
+
1629
+ const int sc = scales[i0 / (QI8_1/2)];
1630
+
1631
+ // fill int with 4x m
1632
+ int m = sc >> 4;
1633
+ m |= m << 8;
1634
+ m |= m << 16;
1635
+
1636
+ #pragma unroll
1637
+ for (int i = i0; i < i0 + QI8_1/2; ++i) {
1638
+ sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
1639
+ sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m
1640
+ }
1641
+
1642
+ sumi_d += sumi_d_sc * (sc & 0xF);
1643
+ }
1644
+
1645
+ const float2 dm2f = __half22float2(dm2);
1646
+
1647
+ return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
1648
+ #else
1649
+ assert(false);
1650
+ return 0.0f; // only to satisfy the compiler
1651
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1652
+ }
1653
+
1654
+ #define VDR_Q3_K_Q8_1_MMVQ 1
1655
+ #define VDR_Q3_K_Q8_1_MMQ 2
1656
+
1657
+ // contiguous v/x values
1658
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
1659
+ const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
1660
+ const int & scale_offset, const float & d3, const float * __restrict__ d8) {
1661
+
1662
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1663
+ float sumf = 0.0f;
1664
+
1665
+ #pragma unroll
1666
+ for (int i = 0; i < QR3_K; ++i) {
1667
+ const int isc = scale_offset + 2*i;
1668
+
1669
+ const int isc_low = isc % (QK_K/32);
1670
+ const int sc_shift_low = 4 * (isc / (QK_K/32));
1671
+ const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
1672
+
1673
+ const int isc_high = isc % (QK_K/64);
1674
+ const int sc_shift_high = 2 * (isc / (QK_K/64));
1675
+ const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
1676
+
1677
+ const int sc = (sc_low | sc_high) - 32;
1678
+
1679
+ const int vil = (vl >> (2*i)) & 0x03030303;
1680
+
1681
+ const int vih = ((vh >> i) << 2) & 0x04040404;
1682
+
1683
+ const int vi = __vsubss4(vil, vih);
1684
+
1685
+ sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
1686
+ }
1687
+
1688
+ return d3 * sumf;
1689
+ #else
1690
+ assert(false);
1691
+ return 0.0f; // only to satisfy the compiler
1692
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1693
+ }
1694
+
1695
+ // contiguous u/y values
1696
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
1697
+ const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
1698
+ const float & d3, const float & d8) {
1699
+
1700
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1701
+ int sumi = 0;
1702
+
1703
+ #pragma unroll
1704
+ for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
1705
+ int sumi_sc = 0;
1706
+
1707
+ for (int i = i0; i < i0 + QI8_1/2; ++i) {
1708
+ sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
1709
+ }
1710
+
1711
+ sumi += sumi_sc * scales[i0 / (QI8_1/2)];
1712
+ }
1713
+
1714
+ return d3*d8 * sumi;
1715
+ #else
1716
+ assert(false);
1717
+ return 0.0f; // only to satisfy the compiler
1718
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1719
+ }
1720
+
1721
+ #define VDR_Q4_K_Q8_1_MMVQ 2
1722
+ #define VDR_Q4_K_Q8_1_MMQ 8
1723
+
1724
+ // contiguous v/x values
1725
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
1726
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
1727
+ const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
1728
+
1729
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1730
+ float sumf_d = 0.0f;
1731
+ float sumf_m = 0.0f;
1732
+
1733
+ #pragma unroll
1734
+ for (int i = 0; i < QR4_K; ++i) {
1735
+ const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
1736
+ const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
1737
+
1738
+ const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
1739
+ const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
1740
+
1741
+ sumf_d += d8[i] * (dot1 * sc[i]);
1742
+ sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
1743
+ }
1744
+
1745
+ const float2 dm4f = __half22float2(dm4);
1746
+
1747
+ return dm4f.x*sumf_d - dm4f.y*sumf_m;
1748
+
1749
+ #else
1750
+ assert(false);
1751
+ return 0.0f; // only to satisfy the compiler
1752
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1753
+ }
1754
+
1755
+ // contiguous u/y values
1756
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
1757
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
1758
+ const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
1759
+
1760
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1761
+ float sumf_d = 0.0f;
1762
+ float sumf_m = 0.0f;
1763
+
1764
+ #pragma unroll
1765
+ for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
1766
+ int sumi_d = 0;
1767
+
1768
+ #pragma unroll
1769
+ for (int j = 0; j < QI8_1; ++j) {
1770
+ sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
1771
+ }
1772
+
1773
+ const float2 ds8f = __half22float2(ds8[i]);
1774
+
1775
+ sumf_d += ds8f.x * (sc[i] * sumi_d);
1776
+ sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
1777
+ }
1778
+
1779
+ const float2 dm4f = __half22float2(dm4);
1780
+
1781
+ return dm4f.x*sumf_d - dm4f.y*sumf_m;
1782
+
1783
+ #else
1784
+ assert(false);
1785
+ return 0.0f; // only to satisfy the compiler
1786
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1787
+ }
1788
+
1789
+ #define VDR_Q5_K_Q8_1_MMVQ 2
1790
+ #define VDR_Q5_K_Q8_1_MMQ 8
1791
+
1792
+ // contiguous v/x values
1793
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
1794
+ const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
1795
+ const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
1796
+
1797
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1798
+ float sumf_d = 0.0f;
1799
+ float sumf_m = 0.0f;
1800
+
1801
+ #pragma unroll
1802
+ for (int i = 0; i < QR5_K; ++i) {
1803
+ const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
1804
+ const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
1805
+
1806
+ const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
1807
+ const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
1808
+
1809
+ const int v0i = vl0i | vh0i;
1810
+ const int v1i = vl1i | vh1i;
1811
+
1812
+ const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
1813
+ const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
1814
+
1815
+ sumf_d += d8[i] * (dot1 * sc[i]);
1816
+ sumf_m += d8[i] * (dot2 * m[i]);
1817
+
1818
+ }
1819
+
1820
+ const float2 dm5f = __half22float2(dm5);
1821
+
1822
+ return dm5f.x*sumf_d - dm5f.y*sumf_m;
1823
+
1824
+ #else
1825
+ assert(false);
1826
+ return 0.0f; // only to satisfy the compiler
1827
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1828
+ }
1829
+
1830
+ // contiguous u/y values
1831
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
1832
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
1833
+ const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
1834
+
1835
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1836
+ float sumf_d = 0.0f;
1837
+ float sumf_m = 0.0f;
1838
+
1839
+ #pragma unroll
1840
+ for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
1841
+ int sumi_d = 0;
1842
+
1843
+ #pragma unroll
1844
+ for (int j = 0; j < QI8_1; ++j) {
1845
+ sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
1846
+ }
1847
+
1848
+ const float2 ds8f = __half22float2(ds8[i]);
1849
+
1850
+ sumf_d += ds8f.x * (sc[i] * sumi_d);
1851
+ sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
1852
+ }
1853
+
1854
+ const float2 dm4f = __half22float2(dm4);
1855
+
1856
+ return dm4f.x*sumf_d - dm4f.y*sumf_m;
1857
+
1858
+ #else
1859
+ assert(false);
1860
+ return 0.0f; // only to satisfy the compiler
1861
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1862
+ }
1863
+
1864
+ #define VDR_Q6_K_Q8_1_MMVQ 1
1865
+ #define VDR_Q6_K_Q8_1_MMQ 8
1866
+
1867
+ // contiguous v/x values
1868
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
1869
+ const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
1870
+ const float & d, const float * __restrict__ d8) {
1871
+
1872
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1873
+ float sumf = 0.0f;
1874
+
1875
+ #pragma unroll
1876
+ for (int i = 0; i < QR6_K; ++i) {
1877
+ const int sc = scales[4*i];
1878
+
1879
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
1880
+
1881
+ const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
1882
+
1883
+ const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
1884
+
1885
+ sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
1886
+ }
1887
+
1888
+ return d*sumf;
1889
+ #else
1890
+ assert(false);
1891
+ return 0.0f; // only to satisfy the compiler
1892
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1893
+ }
1894
+
1895
+ // contiguous u/y values
1896
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
1897
+ const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
1898
+ const float & d6, const float * __restrict__ d8) {
1899
+
1900
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1901
+ float sumf_d = 0.0f;
1902
+
1903
+ #pragma unroll
1904
+ for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
1905
+ int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
1906
+
1907
+ #pragma unroll
1908
+ for (int i = i0; i < i0 + 2; ++i) {
1909
+ sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
1910
+ sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
1911
+
1912
+ sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
1913
+ sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
1914
+ }
1915
+
1916
+ sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
1917
+ }
1918
+
1919
+ return d6 * sumf_d;
1920
+
1921
+ #else
1922
+ assert(false);
1545
1923
  return 0.0f; // only to satisfy the compiler
1546
1924
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
1547
1925
  }
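The q6_K helpers added above rebuild 6-bit quants from two packed halves and remove the stored offset with __vsubss4, CUDA's per-byte saturating subtraction. A hypothetical sketch of that reconstruction, assuming the usual ggml q6_K layout (low 4 bits in vl, high 2 bits in vh, values stored with a +32 bias):

    static __device__ __forceinline__ int reconstruct_q6(const int vl, const int vh, const int i) {
        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;        // four low nibbles
        const int vih = ((vh >> (4*i)) << 4) & 0x30303030; // four high 2-bit fields moved to bits 4..5
        return __vsubss4(vil | vih, 0x20202020);           // subtract 32 in every byte lane
    }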
@@ -1564,21 +1942,21 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
1564
1942
  return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
1565
1943
  }
1566
1944
 
1567
- static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1945
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1568
1946
 
1569
- __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
1570
- __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0];
1947
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
1948
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
1571
1949
 
1572
1950
  *x_ql = tile_x_qs;
1573
1951
  *x_dm = (half2 *) tile_x_d;
1574
1952
  }
1575
1953
 
1576
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
1954
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
1577
1955
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1578
1956
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1579
1957
 
1580
1958
  __builtin_assume(i_offset >= 0);
1581
- __builtin_assume(i_offset < 8);
1959
+ __builtin_assume(i_offset < nwarps);
1582
1960
  __builtin_assume(k >= 0);
1583
1961
  __builtin_assume(k < WARP_SIZE);
1584
1962
 
@@ -1590,7 +1968,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
1590
1968
  float * x_dmf = (float *) x_dm;
1591
1969
 
1592
1970
  #pragma unroll
1593
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
1971
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
1594
1972
  int i = i0 + i_offset;
1595
1973
 
1596
1974
  if (need_check) {
@@ -1600,38 +1978,30 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
1600
1978
  const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
1601
1979
 
1602
1980
  x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
1603
- x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
1981
+ // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
1604
1982
  }
1605
1983
 
1606
- // const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
1607
- // const int kbxd = k % blocks_per_tile_x_row;
1984
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
1985
+ const int kbxd = k % blocks_per_tile_x_row;
1608
1986
 
1609
- // #pragma unroll
1610
- // for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) {
1611
- // FIXME out-of-bounds
1612
- // const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
1987
+ #pragma unroll
1988
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
1989
+ int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
1613
1990
 
1614
- // if (i >= GGML_CUDA_MMQ_Y) {
1615
- // return;
1616
- // }
1991
+ if (need_check) {
1992
+ i = min(i, i_max);
1993
+ }
1617
1994
 
1618
- // const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
1995
+ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
1619
1996
 
1620
- // x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d;
1621
- // }
1997
+ x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
1998
+ }
1622
1999
  }
1623
2000
 
1624
2001
  static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
1625
2002
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1626
2003
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1627
2004
 
1628
- __builtin_assume(i >= 0);
1629
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
1630
- __builtin_assume(j >= 0);
1631
- __builtin_assume(j < WARP_SIZE);
1632
- __builtin_assume(k >= 0);
1633
- __builtin_assume(k < WARP_SIZE);
1634
-
1635
2005
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1636
2006
  const float * x_dmf = (float *) x_dm;
1637
2007
 
@@ -1639,13 +2009,13 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
1639
2009
 
1640
2010
  #pragma unroll
1641
2011
  for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
1642
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1643
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0];
2012
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
2013
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
1644
2014
  }
1645
2015
 
1646
2016
  return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
1647
2017
  (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
1648
- y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
2018
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
1649
2019
  }
1650
2020
 
1651
2021
  static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
@@ -1666,21 +2036,21 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
1666
2036
  return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
1667
2037
  }
1668
2038
 
1669
- static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2039
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1670
2040
 
1671
- __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + + GGML_CUDA_MMQ_Y];
1672
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1) + GGML_CUDA_MMQ_Y/QI4_1];
2041
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
2042
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
1673
2043
 
1674
2044
  *x_ql = tile_x_qs;
1675
2045
  *x_dm = tile_x_dm;
1676
2046
  }
1677
2047
 
1678
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
2048
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
1679
2049
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1680
2050
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1681
2051
 
1682
2052
  __builtin_assume(i_offset >= 0);
1683
- __builtin_assume(i_offset < 8);
2053
+ __builtin_assume(i_offset < nwarps);
1684
2054
  __builtin_assume(k >= 0);
1685
2055
  __builtin_assume(k < WARP_SIZE);
1686
2056
 
@@ -1690,7 +2060,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
1690
2060
  const block_q4_1 * bx0 = (block_q4_1 *) vx;
1691
2061
 
1692
2062
  #pragma unroll
1693
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2063
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
1694
2064
  int i = i0 + i_offset;
1695
2065
 
1696
2066
  if (need_check) {
@@ -1706,7 +2076,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
1706
2076
  const int kbxd = k % blocks_per_tile_x_row;
1707
2077
 
1708
2078
  #pragma unroll
1709
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) {
2079
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
1710
2080
  int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
1711
2081
 
1712
2082
  if (need_check) {
@@ -1723,26 +2093,19 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
1723
2093
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1724
2094
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1725
2095
 
1726
- __builtin_assume(i >= 0);
1727
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
1728
- __builtin_assume(j >= 0);
1729
- __builtin_assume(j < WARP_SIZE);
1730
- __builtin_assume(k >= 0);
1731
- __builtin_assume(k < WARP_SIZE);
1732
-
1733
2096
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1734
2097
 
1735
2098
  int u[2*VDR_Q4_1_Q8_1_MMQ];
1736
2099
 
1737
2100
  #pragma unroll
1738
2101
  for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
1739
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1740
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1];
2102
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
2103
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
1741
2104
  }
1742
2105
 
1743
2106
  return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
1744
2107
  (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
1745
- y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
2108
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
1746
2109
  }
1747
2110
 
1748
2111
  static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
@@ -1765,21 +2128,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
1765
2128
  return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
1766
2129
  }
1767
2130
 
1768
- static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2131
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1769
2132
 
1770
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
1771
- __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0];
2133
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2134
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
1772
2135
 
1773
2136
  *x_ql = tile_x_ql;
1774
2137
  *x_dm = (half2 *) tile_x_d;
1775
2138
  }
1776
2139
 
1777
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
2140
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
1778
2141
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1779
2142
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1780
2143
 
1781
2144
  __builtin_assume(i_offset >= 0);
1782
- __builtin_assume(i_offset < 8);
2145
+ __builtin_assume(i_offset < nwarps);
1783
2146
  __builtin_assume(k >= 0);
1784
2147
  __builtin_assume(k < WARP_SIZE);
1785
2148
 
@@ -1789,7 +2152,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
1789
2152
  const block_q5_0 * bx0 = (block_q5_0 *) vx;
1790
2153
 
1791
2154
  #pragma unroll
1792
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2155
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
1793
2156
  int i = i0 + i_offset;
1794
2157
 
1795
2158
  if (need_check) {
@@ -1825,7 +2188,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
1825
2188
  float * x_dmf = (float *) x_dm;
1826
2189
 
1827
2190
  #pragma unroll
1828
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) {
2191
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
1829
2192
  int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
1830
2193
 
1831
2194
  if (need_check) {
@@ -1842,27 +2205,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
1842
2205
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1843
2206
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1844
2207
 
1845
- __builtin_assume(i >= 0);
1846
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
1847
- __builtin_assume(j >= 0);
1848
- __builtin_assume(j < WARP_SIZE);
1849
- __builtin_assume(k >= 0);
1850
- __builtin_assume(k < WARP_SIZE);
1851
-
1852
2208
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1853
2209
  const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
1854
- const float * x_dmf = (float *) x_dm;
2210
+ const float * x_dmf = (const float *) x_dm;
2211
+ const float * y_df = (const float *) y_ds;
1855
2212
 
1856
2213
  int u[2*VDR_Q5_0_Q8_1_MMQ];
1857
2214
 
1858
2215
  #pragma unroll
1859
2216
  for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
1860
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1861
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0];
2217
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
2218
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
1862
2219
  }
1863
2220
 
1864
2221
  return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
1865
- (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
2222
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
1866
2223
  }
1867
2224
 
1868
2225
  static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
@@ -1885,21 +2242,21 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
1885
2242
  return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
1886
2243
  }
1887
2244
 
1888
- static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2245
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
1889
2246
 
1890
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
1891
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1];
2247
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2248
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
1892
2249
 
1893
2250
  *x_ql = tile_x_ql;
1894
2251
  *x_dm = tile_x_dm;
1895
2252
  }
1896
2253
 
1897
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
2254
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
1898
2255
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
1899
2256
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
1900
2257
 
1901
2258
  __builtin_assume(i_offset >= 0);
1902
- __builtin_assume(i_offset < 8);
2259
+ __builtin_assume(i_offset < nwarps);
1903
2260
  __builtin_assume(k >= 0);
1904
2261
  __builtin_assume(k < WARP_SIZE);
1905
2262
 
@@ -1909,7 +2266,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
1909
2266
  const block_q5_1 * bx0 = (block_q5_1 *) vx;
1910
2267
 
1911
2268
  #pragma unroll
1912
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2269
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
1913
2270
  int i = i0 + i_offset;
1914
2271
 
1915
2272
  if (need_check) {
@@ -1942,7 +2299,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
1942
2299
  const int kbxd = k % blocks_per_tile_x_row;
1943
2300
 
1944
2301
  #pragma unroll
1945
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) {
2302
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
1946
2303
  int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
1947
2304
 
1948
2305
  if (need_check) {
@@ -1959,13 +2316,6 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
1959
2316
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
1960
2317
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
1961
2318
 
1962
- __builtin_assume(i >= 0);
1963
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
1964
- __builtin_assume(j >= 0);
1965
- __builtin_assume(j < WARP_SIZE);
1966
- __builtin_assume(k >= 0);
1967
- __builtin_assume(k < WARP_SIZE);
1968
-
1969
2319
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
1970
2320
  const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
1971
2321
 
@@ -1973,12 +2323,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
1973
2323
 
1974
2324
  #pragma unroll
1975
2325
  for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
1976
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
1977
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1];
2326
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
2327
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
1978
2328
  }
1979
2329
 
1980
2330
  return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
1981
- (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
2331
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
1982
2332
  }
1983
2333
 
1984
2334
  static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
@@ -1989,29 +2339,30 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
1989
2339
  int v[VDR_Q8_0_Q8_1_MMVQ];
1990
2340
  int u[VDR_Q8_0_Q8_1_MMVQ];
1991
2341
 
2342
+ #pragma unroll
1992
2343
  for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
1993
2344
  v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
1994
2345
  u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
1995
2346
  }
1996
2347
 
1997
- return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds);
2348
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
1998
2349
  }
1999
2350
 
2000
- static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2351
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2001
2352
 
2002
- __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2003
- __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0];
2353
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
2354
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
2004
2355
 
2005
2356
  *x_ql = tile_x_qs;
2006
2357
  *x_dm = (half2 *) tile_x_d;
2007
2358
  }
2008
2359
 
2009
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
2360
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
2010
2361
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2011
2362
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2012
2363
 
2013
2364
  __builtin_assume(i_offset >= 0);
2014
- __builtin_assume(i_offset < 8);
2365
+ __builtin_assume(i_offset < nwarps);
2015
2366
  __builtin_assume(k >= 0);
2016
2367
  __builtin_assume(k < WARP_SIZE);
2017
2368
 
@@ -2022,7 +2373,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
2022
2373
  const block_q8_0 * bx0 = (block_q8_0 *) vx;
2023
2374
 
2024
2375
  #pragma unroll
2025
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2376
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2026
2377
  int i = i0 + i_offset;
2027
2378
 
2028
2379
  if (need_check) {
@@ -2032,76 +2383,35 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
2032
2383
  const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
2033
2384
 
2034
2385
  x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
2035
- x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
2036
2386
  }
2037
2387
 
2038
- // const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
2039
- // const int kbxd = k % blocks_per_tile_x_row;
2388
+ const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
2389
+ const int kbxd = k % blocks_per_tile_x_row;
2040
2390
 
2041
- // #pragma unroll
2042
- // for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) {
2043
- // FIXME out-of-bounds
2044
- // const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
2391
+ #pragma unroll
2392
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
2393
+ int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
2045
2394
 
2046
- // #if GGML_CUDA_MMQ_Y < 64
2047
- // if (i >= GGML_CUDA_MMQ_Y) {
2048
- // return;
2049
- // }
2050
- // #endif // GGML_CUDA_MMQ_Y < 64
2395
+ if (need_check) {
2396
+ i = min(i, i_max);
2397
+ }
2051
2398
 
2052
- // const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
2399
+ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
2053
2400
 
2054
- // x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d;
2055
- // }
2401
+ x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
2402
+ }
2056
2403
  }
2057
2404
 
2058
2405
  static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
2059
2406
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2060
2407
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2061
2408
 
2062
- __builtin_assume(i >= 0);
2063
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2064
- __builtin_assume(j >= 0);
2065
- __builtin_assume(j < WARP_SIZE);
2066
- __builtin_assume(k >= 0);
2067
- __builtin_assume(k < WARP_SIZE);
2068
-
2069
- const float * x_dmf = (float *) x_dm;
2409
+ const float * x_dmf = (const float *) x_dm;
2410
+ const float * y_df = (const float *) y_ds;
2070
2411
 
2071
2412
  return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
2072
2413
  (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
2073
- y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
2074
- }
2075
-
2076
- #define VDR_q2_K_q8_1 1
2077
-
2078
- static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl(
2079
- const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
2080
- const half2 & dm, const float * __restrict__ d8) {
2081
-
2082
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2083
- float sumf_d = 0.0f;
2084
- float sumf_m = 0.0f;
2085
-
2086
- for (int i = 0; i < QR2_K; ++i) {
2087
- const int sc = scales[2*i];
2088
-
2089
- const int vi = (v >> (2*i)) & 0x03030303;
2090
-
2091
- sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
2092
-
2093
- int sc_high = sc >> 4;
2094
- sc_high |= sc_high << 8;
2095
- sc_high |= sc_high << 16;
2096
- sumf_m += d8[i] * __dp4a(sc_high, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
2097
- }
2098
-
2099
- const float2 dmf = __half22float2(dm);
2100
-
2101
- return dmf.x*sumf_d - dmf.y*sumf_m;
2102
- #else
2103
- return 0.0f; // only to satisfy the compiler
2104
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2414
+ y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
2105
2415
  }
2106
2416
 
2107
2417
  static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
@@ -2115,34 +2425,35 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
2115
2425
  const uint8_t * scales = bq2_K->scales + scale_offset;
2116
2426
 
2117
2427
  const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
2118
- int u[QR2_K];
2428
+ int u[QR2_K];
2119
2429
  float d8[QR2_K];
2120
2430
 
2431
+ #pragma unroll
2121
2432
  for (int i = 0; i < QR2_K; ++ i) {
2122
2433
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
2123
2434
  d8[i] = bq8_1[bq8_offset + i].ds.x;
2124
2435
  }
2125
2436
 
2126
- return vec_dot_q2_K_q8_1_impl(v, u, scales, bq2_K->dm, d8);
2437
+ return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
2127
2438
  }
2128
2439
 
2129
- static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2440
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2130
2441
 
2131
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2132
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K];
2133
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2442
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2443
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
2444
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
2134
2445
 
2135
2446
  *x_ql = tile_x_ql;
2136
2447
  *x_dm = tile_x_dm;
2137
2448
  *x_sc = tile_x_sc;
2138
2449
  }
2139
2450
 
2140
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
2451
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
2141
2452
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2142
2453
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2143
2454
 
2144
2455
  __builtin_assume(i_offset >= 0);
2145
- __builtin_assume(i_offset < 8);
2456
+ __builtin_assume(i_offset < nwarps);
2146
2457
  __builtin_assume(k >= 0);
2147
2458
  __builtin_assume(k < WARP_SIZE);
2148
2459
 
@@ -2152,7 +2463,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
2152
2463
  const block_q2_K * bx0 = (block_q2_K *) vx;
2153
2464
 
2154
2465
  #pragma unroll
2155
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2466
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2156
2467
  int i = i0 + i_offset;
2157
2468
 
2158
2469
  if (need_check) {
@@ -2168,8 +2479,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
2168
2479
  const int kbxd = k % blocks_per_tile_x_row;
2169
2480
 
2170
2481
  #pragma unroll
2171
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) {
2172
- int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2482
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
2483
+ int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
2173
2484
 
2174
2485
  if (need_check) {
2175
2486
  i = min(i, i_max);
@@ -2181,7 +2492,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
2181
2492
  }
2182
2493
 
2183
2494
  #pragma unroll
2184
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2495
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
2185
2496
  int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2186
2497
 
2187
2498
  if (need_check) {
@@ -2198,68 +2509,24 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
2198
2509
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2199
2510
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2200
2511
 
2201
- __builtin_assume(i >= 0);
2202
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2203
- __builtin_assume(j >= 0);
2204
- __builtin_assume(j < WARP_SIZE);
2205
- __builtin_assume(k >= 0);
2206
- __builtin_assume(k < WARP_SIZE);
2207
-
2208
- const int kbx = k / QI2_K;
2209
- const int kqsx = k % QI2_K;
2210
-
2211
- const int bq8_offset = QR2_K * (kqsx / QI8_1);
2212
- const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
2512
+ const int kbx = k / QI2_K;
2513
+ const int ky = (k % QI2_K) * QR2_K;
2514
+ const float * y_df = (const float *) y_ds;
2213
2515
 
2214
- const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16 + scale_offset;
2516
+ int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
2215
2517
 
2216
- int u[QR2_K];
2217
- float d8[QR2_K];
2518
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
2519
+ const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
2218
2520
 
2219
- for (int l = 0; l < QR2_K; ++ l) {
2220
- const int y_qs_index = j * (QR2_K*WARP_SIZE) + kbx * (QR2_K*QI2_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
2221
- u[l] = y_qs[y_qs_index];
2222
- d8[l] = y_ds[y_qs_index / QI8_1].x;
2521
+ #pragma unroll
2522
+ for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
2523
+ v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
2223
2524
  }
2224
2525
 
2225
- return vec_dot_q2_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], u, scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], d8);
2226
- }
2227
-
2228
- #define VDR_q3_K_q8_1 1
2229
-
2230
- static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl(
2231
- const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
2232
- const int & scale_offset, const float & d, const float * __restrict__ d8) {
2233
-
2234
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2235
- float sumf = 0.0f;
2236
-
2237
- for (int i = 0; i < QR3_K; ++i) {
2238
- const int isc = scale_offset + 2*i;
2239
-
2240
- const int isc_low = isc % (QK_K/32);
2241
- const int sc_shift_low = 4 * (isc / (QK_K/32));
2242
- const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
2243
-
2244
- const int isc_high = isc % (QK_K/64);
2245
- const int sc_shift_high = 2 * (isc / (QK_K/64));
2246
- const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
2247
-
2248
- const int sc = (sc_low | sc_high) - 32;
2249
-
2250
- const int vil = (vl >> (2*i)) & 0x03030303;
2251
-
2252
- const int vih = ((vh >> i) << 2) & 0x04040404;
2253
-
2254
- const int vi = __vsubss4(vil, vih);
2526
+ const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
2255
2527
 
2256
- sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
2257
- }
2258
-
2259
- return d*sumf;
2260
- #else
2261
- return 0.0f; // only to satisfy the compiler
2262
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2528
+ const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
2529
+ return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
2263
2530
  }
2264
2531
 
2265
2532
  static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
@@ -2277,23 +2544,24 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
2277
2544
  // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2278
2545
  const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
2279
2546
 
2280
- int u[QR3_K];
2547
+ int u[QR3_K];
2281
2548
  float d8[QR3_K];
2282
2549
 
2550
+ #pragma unroll
2283
2551
  for (int i = 0; i < QR3_K; ++i) {
2284
2552
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
2285
2553
  d8[i] = bq8_1[bq8_offset + i].ds.x;
2286
2554
  }
2287
2555
 
2288
- return vec_dot_q3_K_q8_1_impl(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
2556
+ return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
2289
2557
  }
2290
2558
 
2291
- static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2559
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2292
2560
 
2293
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2294
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K];
2295
- __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
2296
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2561
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2562
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
2563
+ __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
2564
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
2297
2565
 
2298
2566
  *x_ql = tile_x_ql;
2299
2567
  *x_dm = tile_x_dm;
@@ -2301,12 +2569,12 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 **
2301
2569
  *x_sc = tile_x_sc;
2302
2570
  }
2303
2571
 
2304
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
2572
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
2305
2573
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2306
2574
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2307
2575
 
2308
2576
  __builtin_assume(i_offset >= 0);
2309
- __builtin_assume(i_offset < 8);
2577
+ __builtin_assume(i_offset < nwarps);
2310
2578
  __builtin_assume(k >= 0);
2311
2579
  __builtin_assume(k < WARP_SIZE);
2312
2580
 
@@ -2316,7 +2584,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2316
2584
  const block_q3_K * bx0 = (block_q3_K *) vx;
2317
2585
 
2318
2586
  #pragma unroll
2319
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2587
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2320
2588
  int i = i0 + i_offset;
2321
2589
 
2322
2590
  if (need_check) {
@@ -2330,10 +2598,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2330
2598
 
2331
2599
  const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
2332
2600
  const int kbxd = k % blocks_per_tile_x_row;
2601
+ float * x_dmf = (float *) x_dm;
2333
2602
 
2334
2603
  #pragma unroll
2335
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) {
2336
- int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2604
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
2605
+ int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
2337
2606
 
2338
2607
  if (need_check) {
2339
2608
  i = min(i, i_max);
@@ -2341,11 +2610,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2341
2610
 
2342
2611
  const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
2343
2612
 
2344
- x_dm[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd].x = bxi->d;
2613
+ x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
2345
2614
  }
2346
2615
 
2347
2616
  #pragma unroll
2348
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
2617
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
2349
2618
  int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
2350
2619
 
2351
2620
  if (need_check) {
@@ -2354,11 +2623,12 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2354
2623
 
2355
2624
  const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
2356
2625
 
2357
- x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
2626
+ // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2627
+ x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
2358
2628
  }
2359
2629
 
2360
2630
  #pragma unroll
2361
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2631
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
2362
2632
  int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2363
2633
 
2364
2634
  if (need_check) {
@@ -2367,7 +2637,19 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2367
2637
 
2368
2638
  const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
2369
2639
 
2370
- x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->scales, k % (QI3_K/4));
2640
+ const int ksc = k % (QI3_K/4);
2641
+
2642
+ const int ksc_low = ksc % (QI3_K/8);
2643
+ const int shift_low = 4 * (ksc / (QI3_K/8));
2644
+ const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
2645
+
2646
+ const int ksc_high = QI3_K/8;
2647
+ const int shift_high = 2 * ksc;
2648
+ const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
2649
+
2650
+ const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
2651
+
2652
+ x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
2371
2653
  }
2372
2654
  }
2373
2655
 
@@ -2375,63 +2657,29 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
2375
2657
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2376
2658
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2377
2659
 
2378
- __builtin_assume(i >= 0);
2379
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2380
- __builtin_assume(j >= 0);
2381
- __builtin_assume(j < WARP_SIZE);
2382
- __builtin_assume(k >= 0);
2383
- __builtin_assume(k < WARP_SIZE);
2384
-
2385
2660
  const int kbx = k / QI3_K;
2386
- const int kqsx = k % QI3_K;
2661
+ const int ky = (k % QI3_K) * QR3_K;
2662
+ const float * x_dmf = (const float *) x_dm;
2663
+ const float * y_df = (const float *) y_ds;
2387
2664
 
2388
- const int bq8_offset = QR3_K * (kqsx / (QI3_K/2));
2389
- const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
2665
+ const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
2390
2666
 
2391
- const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16;
2392
-
2393
- // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2394
- const int vh = ~x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset;
2395
-
2396
- int u[QR3_K];
2397
- float d8[QR3_K];
2398
-
2399
- for (int l = 0; l < QR3_K; ++ l) {
2400
- const int y_qs_index = j * (QR3_K*WARP_SIZE) + kbx * (QR3_K*QI3_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
2401
- u[l] = y_qs[y_qs_index];
2402
- d8[l] = y_ds[y_qs_index / QI8_1].x;
2403
- }
2404
-
2405
- return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset,
2406
- x_dm[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx].x, d8);
2407
- }
2408
-
2409
- #define VDR_q4_K_q8_1 2
2410
-
2411
- static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl(
2412
- const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
2413
- const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
2414
-
2415
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2416
- float sumf_d = 0.0f;
2417
- float sumf_m = 0.0f;
2667
+ int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
2418
2668
 
2419
- for (int i = 0; i < QR4_K; ++i) {
2420
- const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
2421
- const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
2669
+ #pragma unroll
2670
+ for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
2671
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
2672
+ const int shift = 2 * ((ky % 32) / 8);
2673
+ const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
2422
2674
 
2423
- const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
2424
- const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
2675
+ const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
2676
+ const int vlh = (vh << 2) & 0x04040404;
2425
2677
 
2426
- sumf_d += d8[i] * (dot1 * sc[i]);
2427
- sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
2678
+ v[l] = __vsubss4(vll, vlh);
2428
2679
  }
2429
2680
 
2430
- return __half2float(dm4.x)*sumf_d - __half2float(dm4.y)*sumf_m;
2431
-
2432
- #else
2433
- return 0.0f; // only to satisfy the compiler
2434
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2681
+ const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
2682
+ return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
2435
2683
  }
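
Because `load_tiles_q3_K` stores the high-bit mask inverted, the inner loop above can reconstruct each 3-bit quant with one mask, one shift and a packed saturated subtraction: `vll` holds the 2 low bits and `vlh` holds 4 wherever the original high bit was 0. A per-value scalar sketch of that reconstruction (function name illustrative):

```cpp
// Scalar sketch of one q3_K quant: value = low2 - (high_bit ? 0 : 4), i.e. the
// inverted mask turns a stored 0/1 into a subtraction of 4/0, giving [-4, 3].
static int q3_K_value_ref(int low2, int high_bit) {
    const int inverted = high_bit ^ 1;   // what "~hmask" stores in the tile
    return low2 - 4 * inverted;          // __vsubss4(vll, vlh) does this on packed bytes
}
```
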
2436
2684
 
2437
2685
  static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
@@ -2478,7 +2726,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
2478
2726
  u[2*i+1] = q8[4];
2479
2727
  }
2480
2728
 
2481
- return vec_dot_q4_K_q8_1_impl(v, u, sc, m, bq4_K->dm, d8);
2729
+ return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
2482
2730
 
2483
2731
  #else
2484
2732
 
@@ -2521,29 +2769,30 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
2521
2769
  return dall * sumf_d - dmin * sumf_m;
2522
2770
 
2523
2771
  #else
2772
+ assert(false);
2524
2773
  return 0.0f; // only to satisfy the compiler
2525
2774
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2526
2775
 
2527
2776
  #endif
2528
2777
  }
2529
2778
 
2530
- static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2779
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2531
2780
 
2532
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2533
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K];
2534
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
2781
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2782
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
2783
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
2535
2784
 
2536
2785
  *x_ql = tile_x_ql;
2537
2786
  *x_dm = tile_x_dm;
2538
2787
  *x_sc = tile_x_sc;
2539
2788
  }
2540
2789
 
2541
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
2790
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
2542
2791
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2543
2792
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2544
2793
 
2545
2794
  __builtin_assume(i_offset >= 0);
2546
- __builtin_assume(i_offset < 8);
2795
+ __builtin_assume(i_offset < nwarps);
2547
2796
  __builtin_assume(k >= 0);
2548
2797
  __builtin_assume(k < WARP_SIZE);
2549
2798
 
@@ -2553,7 +2802,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
2553
2802
  const block_q4_K * bx0 = (block_q4_K *) vx;
2554
2803
 
2555
2804
  #pragma unroll
2556
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2805
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2557
2806
  int i = i0 + i_offset;
2558
2807
 
2559
2808
  if (need_check) {
@@ -2566,11 +2815,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
2566
2815
  }
2567
2816
 
2568
2817
  const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
2569
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2818
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2570
2819
 
2571
2820
  #pragma unroll
2572
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) {
2573
- int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2821
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
2822
+ int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
2574
2823
 
2575
2824
  if (need_check) {
2576
2825
  i = min(i, i_max);
@@ -2582,8 +2831,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
2582
2831
  }
2583
2832
 
2584
2833
  #pragma unroll
2585
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
2586
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
2834
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
2835
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
2587
2836
 
2588
2837
  if (need_check) {
2589
2838
  i = min(i, i_max);
@@ -2591,90 +2840,27 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
2591
2840
 
2592
2841
  const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
2593
2842
 
2594
- x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI4_K/8));
2595
- }
2596
- }
2597
-
2598
- static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
2599
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2600
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2601
-
2602
- __builtin_assume(i >= 0);
2603
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2604
- __builtin_assume(j >= 0);
2605
- __builtin_assume(j < WARP_SIZE);
2606
- __builtin_assume(k >= 0);
2607
- __builtin_assume(k < WARP_SIZE);
2608
-
2609
- const int kbx = k / QI6_K; // == 0 if QK_K == 256
2610
- const int kqsx = k % QI6_K; // == k if QK_K == 256
2611
-
2612
- int v[2];
2613
- int u[2*QR4_K];
2614
- float d8[QR4_K];
2843
+ const int * scales = (int *) bxi->scales;
2615
2844
 
2616
- // kqsx is in 0,2...30. bq8_offset = 2 * (kqsx/4) -> bq8_offset = 0, 2, 4, 6
2617
- const int bq8_offset = QR4_K * ((kqsx/2) / (QI8_1/2));
2845
+ const int ksc = k % (WARP_SIZE/8);
2618
2846
 
2619
- v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
2620
- v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
2847
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
2848
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
2849
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
2621
2850
 
2622
- const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
2623
- uint16_t aux[2];
2624
- const int l = bq8_offset/2;
2625
- if (l < 2) {
2626
- aux[0] = scales[l+0] & 0x3f3f;
2627
- aux[1] = scales[l+2] & 0x3f3f;
2628
- } else {
2629
- aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
2630
- aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
2851
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
2631
2852
  }
2632
- const uint8_t * sc = (const uint8_t *)aux;
2633
- const uint8_t * m = sc + 2;
2634
-
2635
- for (int l = 0; l < QR4_K; ++l) {
2636
- const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
2637
- u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
2638
- u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
2639
- d8[l] = y_ds[kqsy / QI8_1].x;
2640
- }
2641
-
2642
- return vec_dot_q4_K_q8_1_impl(v, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K + kbx], d8);
2643
2853
  }
2644
2854
 
2645
- #define VDR_q5_K_q8_1 2
2646
-
2647
- static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
2648
- const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
2649
- const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
2650
-
2651
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2652
- float sumf_d = 0.0f;
2653
- float sumf_m = 0.0f;
2654
-
2655
- for (int i = 0; i < QR5_K; ++i) {
2656
- const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
2657
- const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
2658
-
2659
- const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
2660
- const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
2661
-
2662
- const int v0i = vl0i | vh0i;
2663
- const int v1i = vl1i | vh1i;
2664
-
2665
- const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
2666
- const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
2667
-
2668
- sumf_d += d8[i] * (dot1 * sc[i]);
2669
- sumf_m += d8[i] * (dot2 * m[i]);
2670
-
2671
- }
2855
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
2856
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2857
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2672
2858
 
2673
- return __half2float(dm5.x)*sumf_d - __half2float(dm5.y)*sumf_m;
2859
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
2674
2860
 
2675
- #else
2676
- return 0.0f; // only to satisfy the compiler
2677
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2861
+ const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
2862
+ return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
2863
+ x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
2678
2864
  }
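
For q4_K (and q5_K below) the 12-byte scale block packs eight 6-bit scales and eight 6-bit minimums; the repacked `x_sc` entries make `sc` and `sc + 8` directly byte-addressable in the call above. A scalar sketch of the conventional unpacking these kernels assume, mirroring the reference `get_scale_min_k4` helper (name kept for orientation, not part of the diff):

```cpp
#include <cstdint>

// Scalar sketch: scale d and minimum m for sub-block j (0..7) of a q4_K/q5_K
// super-block, from the 12-byte packed scales array q.
static void get_scale_min_ref(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
    if (j < 4) {
        *d = q[j]     & 63;                               // low 6 bits of bytes 0..3
        *m = q[j + 4] & 63;                               // low 6 bits of bytes 4..7
    } else {
        *d = (q[j + 4] & 0x0F) | ((q[j - 4] >> 6) << 4);  // 4 bits from bytes 8..11, 2 from 0..3
        *m = (q[j + 4] >>  4) | ((q[j    ] >> 6) << 4);   // 4 bits from bytes 8..11, 2 from 4..7
    }
}
```
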
2679
2865
 
2680
2866
  static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
@@ -2711,6 +2897,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
2711
2897
  const uint8_t * sc = (const uint8_t *)aux;
2712
2898
  const uint8_t * m = sc + 2;
2713
2899
 
2900
+ #pragma unroll
2714
2901
  for (int i = 0; i < QR5_K; ++i) {
2715
2902
  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
2716
2903
  d8[i] = bq8i->ds.x;
@@ -2720,7 +2907,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
2720
2907
  u[2*i+1] = q8[4];
2721
2908
  }
2722
2909
 
2723
- return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
2910
+ return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
2724
2911
 
2725
2912
  #else
2726
2913
 
@@ -2759,31 +2946,30 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
2759
2946
  return d * sumf_d;
2760
2947
 
2761
2948
  #else
2949
+ assert(false);
2762
2950
  return 0.0f; // only to satisfy the compiler
2763
2951
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2764
2952
 
2765
2953
  #endif
2766
2954
  }
2767
2955
 
2768
- static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2956
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2769
2957
 
2770
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2771
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K];
2772
- __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2773
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
2958
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2959
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
2960
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
2774
2961
 
2775
2962
  *x_ql = tile_x_ql;
2776
2963
  *x_dm = tile_x_dm;
2777
- *x_qh = tile_x_qh;
2778
2964
  *x_sc = tile_x_sc;
2779
2965
  }
2780
2966
 
2781
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
2967
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
2782
2968
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2783
2969
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2784
2970
 
2785
2971
  __builtin_assume(i_offset >= 0);
2786
- __builtin_assume(i_offset < 8);
2972
+ __builtin_assume(i_offset < nwarps);
2787
2973
  __builtin_assume(k >= 0);
2788
2974
  __builtin_assume(k < WARP_SIZE);
2789
2975
 
@@ -2793,7 +2979,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
2793
2979
  const block_q5_K * bx0 = (block_q5_K *) vx;
2794
2980
 
2795
2981
  #pragma unroll
2796
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2982
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2797
2983
  int i = i0 + i_offset;
2798
2984
 
2799
2985
  if (need_check) {
@@ -2801,16 +2987,29 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
2801
2987
  }
2802
2988
 
2803
2989
  const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
2990
+ const int ky = QR5_K*kqsx;
2804
2991
 
2805
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2992
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
2993
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
2994
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
2995
+
2996
+ const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
2997
+ const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
2998
+ const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
2999
+
3000
+ const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
3001
+ const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
3002
+
3003
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
3004
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
2806
3005
  }
2807
3006
 
2808
3007
  const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
2809
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
3008
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2810
3009
 
2811
3010
  #pragma unroll
2812
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) {
2813
- int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
3011
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
3012
+ int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
2814
3013
 
2815
3014
  if (need_check) {
2816
3015
  i = min(i, i_max);
@@ -2822,107 +3021,37 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
2822
3021
  }
2823
3022
 
2824
3023
  #pragma unroll
2825
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2826
- int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2827
-
2828
- if (need_check) {
2829
- i = min(i, i_max);
2830
- }
2831
-
2832
- const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI5_K/4);
2833
-
2834
- x_qh[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->qh, k % (QI5_K/4));
2835
- }
2836
-
2837
- #pragma unroll
2838
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
2839
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
3024
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
3025
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
2840
3026
 
2841
3027
  if (need_check) {
2842
3028
  i = min(i, i_max);
2843
- }
2844
-
2845
- const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
2846
-
2847
- x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI5_K/8));
2848
- }
2849
- }
2850
-
2851
- static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
2852
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2853
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2854
-
2855
- __builtin_assume(i >= 0);
2856
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2857
- __builtin_assume(j >= 0);
2858
- __builtin_assume(j < WARP_SIZE);
2859
- __builtin_assume(k >= 0);
2860
- __builtin_assume(k < WARP_SIZE);
2861
-
2862
- const int kbx = k / QI6_K; // == 0 if QK_K == 256
2863
- const int kqsx = k % QI6_K; // == k if QK_K == 256
2864
-
2865
- int vl[2];
2866
- int vh[2];
2867
- int u[2*QR4_K];
2868
- float d8[QR4_K];
2869
-
2870
- const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2));
2871
-
2872
- vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
2873
- vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
2874
-
2875
- vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset;
2876
- vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset;
2877
-
2878
- const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
2879
- uint16_t aux[2];
2880
- const int l = bq8_offset/2;
2881
- if (l < 2) {
2882
- aux[0] = scales[l+0] & 0x3f3f;
2883
- aux[1] = scales[l+2] & 0x3f3f;
2884
- } else {
2885
- aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
2886
- aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
2887
- }
2888
- const uint8_t * sc = (const uint8_t *)aux;
2889
- const uint8_t * m = sc + 2;
2890
-
2891
- for (int l = 0; l < QR5_K; ++l) {
2892
- const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
2893
- u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
2894
- u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
2895
- d8[l] = y_ds[kqsy / QI8_1].x;
2896
- }
2897
-
2898
- return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K + kbx], d8);
2899
- }
2900
-
2901
- #define VDR_q6_K_q8_1 1
2902
-
2903
- static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl(
2904
- const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
2905
- const float & d, const float * __restrict__ d8) {
2906
-
2907
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2908
- float sumf = 0.0f;
3029
+ }
2909
3030
 
2910
- for (int i = 0; i < QR6_K; ++i) {
2911
- const int sc = scales[4*i];
3031
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
2912
3032
 
2913
- const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
3033
+ const int * scales = (int *) bxi->scales;
2914
3034
 
2915
- const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
3035
+ const int ksc = k % (WARP_SIZE/8);
2916
3036
 
2917
- const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
3037
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
3038
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
3039
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
2918
3040
 
2919
- sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
3041
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
2920
3042
  }
3043
+ }
2921
3044
 
2922
- return d*sumf;
2923
- #else
2924
- return 0.0f; // only to satisfy the compiler
2925
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
3045
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
3046
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3047
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
3048
+
3049
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
3050
+
3051
+ const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
3052
+ const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
3053
+ return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
3054
+ x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
2926
3055
  }
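
`load_tiles_q5_K` now merges the 4 low bits from `qs` and the single high bit from `qh` into full 5-bit values at load time, which is why the q5_K tile was widened to a `2*WARP_SIZE + 1` stride and the separate `tile_x_qh` buffer could be dropped. A scalar sketch of the recombination (names illustrative):

```cpp
// Scalar sketch: one q5_K quant in 0..31 from its packed parts. The per-
// sub-block minimum is applied later through the q8_1 block sums, so no
// recentering happens here.
static int q5_K_value_ref(int low4 /* from qs */, int high_bit /* from qh */) {
    return low4 | (high_bit << 4);
}
```

Doing this once per tile load keeps the bit-twiddling out of the hot `vec_dot_q5_K_q8_1_impl_mmq` loop.
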
2927
3056
 
2928
3057
  static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -2942,33 +3071,32 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
2942
3071
  int u[QR6_K];
2943
3072
  float d8[QR6_K];
2944
3073
 
3074
+ #pragma unroll
2945
3075
  for (int i = 0; i < QR6_K; ++i) {
2946
3076
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
2947
3077
  d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
2948
3078
  }
2949
3079
 
2950
- return vec_dot_q6_K_q8_1_impl(vl, vh, u, scales, bq6_K->d, d8);
3080
+ return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
2951
3081
  }
2952
3082
 
2953
- static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
3083
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2954
3084
 
2955
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2956
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K];
2957
- __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
2958
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
3085
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
3086
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
3087
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
2959
3088
 
2960
3089
  *x_ql = tile_x_ql;
2961
3090
  *x_dm = tile_x_dm;
2962
- *x_qh = tile_x_qh;
2963
3091
  *x_sc = tile_x_sc;
2964
3092
  }
2965
3093
 
2966
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
3094
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
2967
3095
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2968
3096
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2969
3097
 
2970
3098
  __builtin_assume(i_offset >= 0);
2971
- __builtin_assume(i_offset < 8);
3099
+ __builtin_assume(i_offset < nwarps);
2972
3100
  __builtin_assume(k >= 0);
2973
3101
  __builtin_assume(k < WARP_SIZE);
2974
3102
 
@@ -2978,7 +3106,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
2978
3106
  const block_q6_K * bx0 = (block_q6_K *) vx;
2979
3107
 
2980
3108
  #pragma unroll
2981
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
3109
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2982
3110
  int i = i0 + i_offset;
2983
3111
 
2984
3112
  if (need_check) {
@@ -2986,42 +3114,43 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
2986
3114
  }
2987
3115
 
2988
3116
  const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
3117
+ const int ky = QR6_K*kqsx;
2989
3118
 
2990
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->ql, kqsx);
2991
- }
2992
-
2993
- const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
2994
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2995
-
2996
- #pragma unroll
2997
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
2998
- int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
3119
+ const int ql = get_int_from_uint8(bxi->ql, kqsx);
3120
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
3121
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
2999
3122
 
3000
- if (need_check) {
3001
- i = min(i, i_max);
3002
- }
3123
+ const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
3124
+ const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
3125
+ const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
3003
3126
 
3004
- const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
3127
+ const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
3128
+ const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
3005
3129
 
3006
- x_dm[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd].x = bxi->d;
3130
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
3131
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
3007
3132
  }
3008
3133
 
3134
+ const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
3135
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
3136
+ float * x_dmf = (float *) x_dm;
3137
+
3009
3138
  #pragma unroll
3010
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
3011
- int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
3139
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
3140
+ int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
3012
3141
 
3013
3142
  if (need_check) {
3014
3143
  i = min(i, i_max);
3015
3144
  }
3016
3145
 
3017
- const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI6_K/2);
3146
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
3018
3147
 
3019
- x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->qh, k % (QI6_K/2));
3148
+ x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
3020
3149
  }
3021
3150
 
3022
3151
  #pragma unroll
3023
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
3024
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
3152
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
3153
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
3025
3154
 
3026
3155
  if (need_check) {
3027
3156
  i = min(i, i_max);
@@ -3037,41 +3166,19 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
3037
3166
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3038
3167
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
3039
3168
 
3040
- __builtin_assume(i >= 0);
3041
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
3042
- __builtin_assume(j >= 0);
3043
- __builtin_assume(j < WARP_SIZE);
3044
- __builtin_assume(k >= 0);
3045
- __builtin_assume(k < WARP_SIZE);
3046
-
3047
- const int kbx = k / QI6_K; // == 0 if QK_K == 256
3048
- const int kqsx = k % QI6_K; // == k if QK_K == 256
3049
-
3050
- const int bq8_offset = 2 * QR6_K * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/4);
3051
- const int scale_offset = (QI6_K/4) * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/8);
3052
- const int vh_shift = 2 * ((kqsx % (QI6_K/2)) / (QI6_K/4));
3169
+ const float * x_dmf = (const float *) x_dm;
3170
+ const float * y_df = (const float *) y_ds;
3053
3171
 
3054
- const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI6_K/2) + (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)] >> vh_shift;
3172
+ const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
3055
3173
 
3056
- const int x_sc_offset = i * (WARP_SIZE/8) + i/8 + kbx * (QI6_K/8);
3057
- const int8_t * scales = ((int8_t *) (x_sc + x_sc_offset)) + scale_offset;
3058
-
3059
- int u[QR6_K];
3060
- float d8[QR6_K];
3061
-
3062
- for (int l = 0; l < QR6_K; ++l) {
3063
- const int kqsy = j * (QR6_K*WARP_SIZE) + kbx * (QR6_K*QI6_K) + (bq8_offset + 2*l)*QI8_1 + kqsx % QI8_1;
3064
- u[l] = y_qs[kqsy];
3065
- d8[l] = y_ds[kqsy / QI8_1].x;
3066
- }
3067
-
3068
- return vec_dot_q6_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales,
3069
- x_dm[i * (WARP_SIZE/QI6_K) + i/QI6_K + kbx].x, d8);
3174
+ const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
3175
+ const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
3176
+ return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
3070
3177
  }
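
q6_K follows the same pattern but folds the two high bits in and subtracts 32 immediately (`__vsubss4(ql0 | qh0, 0x20202020)`), so the tile already holds signed bytes in [-32, 31] and the per-block scale can be stored as a plain `float`. The dot product then reduces to packed byte multiplies; a scalar stand-in for the intrinsic, as an illustration only:

```cpp
#include <cstdint>

// Scalar sketch: one q6_K quant from its 4 low and 2 high bits.
static int q6_K_value_ref(int low4, int high2) {
    return (low4 | (high2 << 4)) - 32;   // signed range [-32, 31]
}

// Scalar stand-in for __dp4a on already-recentred bytes.
static int dp4a_ref(const int8_t a[4], const int8_t b[4], int acc) {
    for (int i = 0; i < 4; ++i) acc += a[i] * b[i];
    return acc;
}
```
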
3071
3178
 
3072
- template <int qk, int qr, int qi, typename block_q_t,
3179
+ template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
3073
3180
  allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
3074
- static __global__ void mul_mat_q(
3181
+ static __device__ __forceinline__ void mul_mat_q(
3075
3182
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3076
3183
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3077
3184
 
@@ -3084,14 +3191,10 @@ static __global__ void mul_mat_q(
3084
3191
 
3085
3192
  const int & ncols_dst = ncols_y;
3086
3193
 
3087
- const int tid_x = threadIdx.x;
3088
- const int tid_y = threadIdx.y;
3089
-
3090
- const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
3194
+ const int row_dst_0 = blockIdx.x*mmq_y;
3091
3195
  const int & row_x_0 = row_dst_0;
3092
- const int row_dst = row_dst_0 + tid_x;
3093
3196
 
3094
- const int col_dst_0 = blockIdx.y*WARP_SIZE;
3197
+ const int col_dst_0 = blockIdx.y*mmq_x;
3095
3198
  const int & col_y_0 = col_dst_0;
3096
3199
 
3097
3200
  int * tile_x_ql = nullptr;
@@ -3101,75 +3204,444 @@ static __global__ void mul_mat_q(
3101
3204
 
3102
3205
  allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
3103
3206
 
3104
- const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1;
3105
-
3106
- __shared__ int tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
3107
- __shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];
3207
+ __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
3208
+ __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
3108
3209
 
3109
- float sum[GGML_CUDA_MMQ_Y/WARP_SIZE][4] = {0.0f};
3210
+ float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
3110
3211
 
3111
3212
  for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
3112
3213
 
3113
3214
  load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
3114
- tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x);
3215
+ threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
3115
3216
 
3217
+ #pragma unroll
3116
3218
  for (int ir = 0; ir < qr; ++ir) {
3117
- const int kqs = ir*WARP_SIZE + tid_x;
3219
+ const int kqs = ir*WARP_SIZE + threadIdx.x;
3118
3220
  const int kbxd = kqs / QI8_1;
3119
3221
 
3120
- for (int i = 0; i < WARP_SIZE; i += 8) {
3121
- const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
3222
+ #pragma unroll
3223
+ for (int i = 0; i < mmq_x; i += nwarps) {
3224
+ const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
3122
3225
 
3123
3226
  const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
3124
3227
 
3125
- tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1);
3228
+ const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
3229
+ tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
3126
3230
  }
3127
- }
3128
3231
 
3129
- for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) {
3130
- const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE;
3131
- const int kby = tid_x % blocks_per_tile_y_col;
3132
- const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
3133
- tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby] = y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds;
3134
- }
3232
+ #pragma unroll
3233
+ for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
3234
+ const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
3235
+ const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
3236
+ const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
3237
+
3238
+ // if the sum is not needed it's faster to transform the scale to f32 ahead of time
3239
+ const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
3240
+ half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
3241
+ if (need_sum) {
3242
+ *dsi_dst = *dsi_src;
3243
+ } else {
3244
+ float * dfi_dst = (float *) dsi_dst;
3245
+ *dfi_dst = (*dsi_src).x;
3246
+ }
3247
+ }
3135
3248
 
3136
- __syncthreads();
3249
+ __syncthreads();
3137
3250
 
3138
- #if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
3139
- #pragma unroll
3140
- #endif // __CUDA_ARCH__ >= 700
3141
- for (int k = 0; k < WARP_SIZE; k += vdr) {
3251
+ // #pragma unroll // unrolling this loop causes too much register pressure
3252
+ for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
3142
3253
  #pragma unroll
3143
- for (int j = 0; j < WARP_SIZE; j += 8) {
3254
+ for (int j = 0; j < mmq_x; j += nwarps) {
3144
3255
  #pragma unroll
3145
- for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
3146
- sum[i/WARP_SIZE][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
3147
- tid_x + i, tid_y + j, k);
3256
+ for (int i = 0; i < mmq_y; i += WARP_SIZE) {
3257
+ sum[i/WARP_SIZE][j/nwarps] += vec_dot(
3258
+ tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
3259
+ threadIdx.x + i, threadIdx.y + j, k);
3260
+ }
3148
3261
  }
3149
3262
  }
3150
- }
3151
-
3152
- __syncthreads();
3153
- }
3154
3263
 
3155
-
3156
- if (row_dst >= nrows_dst) {
3157
- return;
3264
+ __syncthreads();
3265
+ }
3158
3266
  }
3159
3267
 
3160
- for (int j = 0; j < WARP_SIZE; j += 8) {
3161
- const int col_dst = col_dst_0 + j + tid_y;
3268
+ #pragma unroll
3269
+ for (int j = 0; j < mmq_x; j += nwarps) {
3270
+ const int col_dst = col_dst_0 + j + threadIdx.y;
3162
3271
 
3163
3272
  if (col_dst >= ncols_dst) {
3164
3273
  return;
3165
3274
  }
3166
3275
 
3167
- for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
3168
- dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/8];
3276
+ #pragma unroll
3277
+ for (int i = 0; i < mmq_y; i += WARP_SIZE) {
3278
+ const int row_dst = row_dst_0 + threadIdx.x + i;
3279
+
3280
+ if (row_dst >= nrows_dst) {
3281
+ continue;
3282
+ }
3283
+
3284
+ dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
3169
3285
  }
3170
3286
  }
3171
3287
  }
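
With the kernel body turned into a templated `__device__` function, each thread block of `WARP_SIZE * nwarps` threads accumulates an `mmq_y` by `mmq_x` tile of `dst`, and the tile shapes are chosen per architecture by the wrappers that follow. The `need_sum` flag covers the q8_1 scale handling: assuming `ds` stores `{d8, s8}` with `s8 = d8 * sum(q8 quants)`, quant types with a per-block minimum need both values, while the others only need `d8`, so it is converted to `float` when the tile is filled. A sketch of the two reductions, with the per-thread duplication factors and the sign of the minimum term (added for q4_1/q5_1, subtracted via `dmin` for the K-quants) left out:

```cpp
// Illustrative only: sumi is the integer dot product of the packed quants.
static float combine_no_min(float d, float d8, int sumi) {
    return d * d8 * sumi;              // only the q8_1 scale is needed
}
static float combine_with_min(float d, float m, float d8, float s8, int sumi) {
    return d * d8 * sumi + m * s8;     // m * sum(y) == m * s8 (q4_1-style sign)
}
```
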
3172
3288
 
3289
+ #define MMQ_X_Q4_0_AMPERE 64
3290
+ #define MMQ_Y_Q4_0_AMPERE 128
3291
+ #define NWARPS_Q4_0_AMPERE 4
3292
+ #define MMQ_X_Q4_0_PASCAL 64
3293
+ #define MMQ_Y_Q4_0_PASCAL 64
3294
+ #define NWARPS_Q4_0_PASCAL 8
3295
+
3296
+ template <bool need_check> static __global__ void mul_mat_q4_0(
3297
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3298
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3299
+
3300
+ #if __CUDA_ARCH__ >= CC_TURING
3301
+ const int mmq_x = MMQ_X_Q4_0_AMPERE;
3302
+ const int mmq_y = MMQ_Y_Q4_0_AMPERE;
3303
+ const int nwarps = NWARPS_Q4_0_AMPERE;
3304
+
3305
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3306
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3307
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3308
+
3309
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3310
+ const int mmq_x = MMQ_X_Q4_0_PASCAL;
3311
+ const int mmq_y = MMQ_Y_Q4_0_PASCAL;
3312
+ const int nwarps = NWARPS_Q4_0_PASCAL;
3313
+
3314
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3315
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3316
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3317
+ #else
3318
+ (void) vec_dot_q4_0_q8_1_mul_mat;
3319
+ assert(false);
3320
+ #endif // __CUDA_ARCH__ >= CC_TURING
3321
+ }
3322
+
3323
+ #define MMQ_X_Q4_1_AMPERE 64
3324
+ #define MMQ_Y_Q4_1_AMPERE 128
3325
+ #define NWARPS_Q4_1_AMPERE 4
3326
+ #define MMQ_X_Q4_1_PASCAL 64
3327
+ #define MMQ_Y_Q4_1_PASCAL 64
3328
+ #define NWARPS_Q4_1_PASCAL 8
3329
+
3330
+ template <bool need_check> static __global__ void
3331
+ #if __CUDA_ARCH__ < CC_TURING
3332
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
3333
+ #endif // __CUDA_ARCH__ < CC_TURING
3334
+ mul_mat_q4_1(
3335
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3336
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3337
+
3338
+ #if __CUDA_ARCH__ >= CC_TURING
3339
+ const int mmq_x = MMQ_X_Q4_1_AMPERE;
3340
+ const int mmq_y = MMQ_Y_Q4_1_AMPERE;
3341
+ const int nwarps = NWARPS_Q4_1_AMPERE;
3342
+
3343
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
3344
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3345
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3346
+
3347
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3348
+ const int mmq_x = MMQ_X_Q4_1_PASCAL;
3349
+ const int mmq_y = MMQ_Y_Q4_1_PASCAL;
3350
+ const int nwarps = NWARPS_Q4_1_PASCAL;
3351
+
3352
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
3353
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3354
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3355
+ #else
3356
+ (void) vec_dot_q4_1_q8_1_mul_mat;
3357
+ assert(false);
3358
+ #endif // __CUDA_ARCH__ >= CC_TURING
3359
+ }
3360
+
3361
+ #define MMQ_X_Q5_0_AMPERE 128
3362
+ #define MMQ_Y_Q5_0_AMPERE 64
3363
+ #define NWARPS_Q5_0_AMPERE 4
3364
+ #define MMQ_X_Q5_0_PASCAL 64
3365
+ #define MMQ_Y_Q5_0_PASCAL 64
3366
+ #define NWARPS_Q5_0_PASCAL 8
3367
+
3368
+ template <bool need_check> static __global__ void mul_mat_q5_0(
3369
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3370
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3371
+
3372
+ #if __CUDA_ARCH__ >= CC_TURING
3373
+ const int mmq_x = MMQ_X_Q5_0_AMPERE;
3374
+ const int mmq_y = MMQ_Y_Q5_0_AMPERE;
3375
+ const int nwarps = NWARPS_Q5_0_AMPERE;
3376
+
3377
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
3378
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3379
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3380
+
3381
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3382
+ const int mmq_x = MMQ_X_Q5_0_PASCAL;
3383
+ const int mmq_y = MMQ_Y_Q5_0_PASCAL;
3384
+ const int nwarps = NWARPS_Q5_0_PASCAL;
3385
+
3386
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
3387
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3388
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3389
+ #else
3390
+ (void) vec_dot_q5_0_q8_1_mul_mat;
3391
+ assert(false);
3392
+ #endif // __CUDA_ARCH__ >= CC_TURING
3393
+ }
3394
+
3395
+ #define MMQ_X_Q5_1_AMPERE 128
3396
+ #define MMQ_Y_Q5_1_AMPERE 64
3397
+ #define NWARPS_Q5_1_AMPERE 4
3398
+ #define MMQ_X_Q5_1_PASCAL 64
3399
+ #define MMQ_Y_Q5_1_PASCAL 64
3400
+ #define NWARPS_Q5_1_PASCAL 8
3401
+
3402
+ template <bool need_check> static __global__ void mul_mat_q5_1(
3403
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3404
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3405
+
3406
+ #if __CUDA_ARCH__ >= CC_TURING
3407
+ const int mmq_x = MMQ_X_Q5_1_AMPERE;
3408
+ const int mmq_y = MMQ_Y_Q5_1_AMPERE;
3409
+ const int nwarps = NWARPS_Q5_1_AMPERE;
3410
+
3411
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
3412
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3413
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3414
+
3415
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3416
+ const int mmq_x = MMQ_X_Q5_1_PASCAL;
3417
+ const int mmq_y = MMQ_Y_Q5_1_PASCAL;
3418
+ const int nwarps = NWARPS_Q5_1_PASCAL;
3419
+
3420
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
3421
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3422
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3423
+ #else
3424
+ (void) vec_dot_q5_1_q8_1_mul_mat;
3425
+ assert(false);
3426
+ #endif // __CUDA_ARCH__ >= CC_TURING
3427
+ }
3428
+
3429
+ #define MMQ_X_Q8_0_AMPERE 128
3430
+ #define MMQ_Y_Q8_0_AMPERE 64
3431
+ #define NWARPS_Q8_0_AMPERE 4
3432
+ #define MMQ_X_Q8_0_PASCAL 64
3433
+ #define MMQ_Y_Q8_0_PASCAL 64
3434
+ #define NWARPS_Q8_0_PASCAL 8
3435
+
3436
+ template <bool need_check> static __global__ void mul_mat_q8_0(
3437
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3438
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3439
+
3440
+ #if __CUDA_ARCH__ >= CC_TURING
3441
+ const int mmq_x = MMQ_X_Q8_0_AMPERE;
3442
+ const int mmq_y = MMQ_Y_Q8_0_AMPERE;
3443
+ const int nwarps = NWARPS_Q8_0_AMPERE;
3444
+
3445
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
3446
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3447
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3448
+
3449
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3450
+ const int mmq_x = MMQ_X_Q8_0_PASCAL;
3451
+ const int mmq_y = MMQ_Y_Q8_0_PASCAL;
3452
+ const int nwarps = NWARPS_Q8_0_PASCAL;
3453
+
3454
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
3455
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3456
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3457
+ #else
3458
+ (void) vec_dot_q8_0_q8_1_mul_mat;
3459
+ assert(false);
3460
+ #endif // __CUDA_ARCH__ >= CC_TURING
3461
+ }
3462
+
3463
+ #define MMQ_X_Q2_K_AMPERE 64
3464
+ #define MMQ_Y_Q2_K_AMPERE 128
3465
+ #define NWARPS_Q2_K_AMPERE 4
3466
+ #define MMQ_X_Q2_K_PASCAL 64
3467
+ #define MMQ_Y_Q2_K_PASCAL 64
3468
+ #define NWARPS_Q2_K_PASCAL 8
3469
+
3470
+ template <bool need_check> static __global__ void mul_mat_q2_K(
3471
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3472
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3473
+
3474
+ #if __CUDA_ARCH__ >= CC_TURING
3475
+ const int mmq_x = MMQ_X_Q2_K_AMPERE;
3476
+ const int mmq_y = MMQ_Y_Q2_K_AMPERE;
3477
+ const int nwarps = NWARPS_Q2_K_AMPERE;
3478
+
3479
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
3480
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3481
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3482
+
3483
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3484
+ const int mmq_x = MMQ_X_Q2_K_PASCAL;
3485
+ const int mmq_y = MMQ_Y_Q2_K_PASCAL;
3486
+ const int nwarps = NWARPS_Q2_K_PASCAL;
3487
+
3488
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
3489
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3490
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3491
+ #else
3492
+ (void) vec_dot_q2_K_q8_1_mul_mat;
3493
+ assert(false);
3494
+ #endif // __CUDA_ARCH__ >= CC_TURING
3495
+ }
3496
+
3497
+ #define MMQ_X_Q3_K_AMPERE 128
3498
+ #define MMQ_Y_Q3_K_AMPERE 128
3499
+ #define NWARPS_Q3_K_AMPERE 4
3500
+ #define MMQ_X_Q3_K_PASCAL 64
3501
+ #define MMQ_Y_Q3_K_PASCAL 64
3502
+ #define NWARPS_Q3_K_PASCAL 8
3503
+
3504
+ template <bool need_check> static __global__ void
3505
+ #if __CUDA_ARCH__ < CC_TURING
3506
+ __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
3507
+ #endif // __CUDA_ARCH__ < CC_TURING
3508
+ mul_mat_q3_K(
3509
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3510
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3511
+
3512
+ #if __CUDA_ARCH__ >= CC_TURING
3513
+ const int mmq_x = MMQ_X_Q3_K_AMPERE;
3514
+ const int mmq_y = MMQ_Y_Q3_K_AMPERE;
3515
+ const int nwarps = NWARPS_Q3_K_AMPERE;
3516
+
3517
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
3518
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3519
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3520
+
3521
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3522
+ const int mmq_x = MMQ_X_Q3_K_PASCAL;
3523
+ const int mmq_y = MMQ_Y_Q3_K_PASCAL;
3524
+ const int nwarps = NWARPS_Q3_K_PASCAL;
3525
+
3526
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
3527
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3528
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3529
+ #else
3530
+ (void) vec_dot_q3_K_q8_1_mul_mat;
3531
+ assert(false);
3532
+ #endif // __CUDA_ARCH__ >= CC_TURING
3533
+ }
3534
+
3535
+ #define MMQ_X_Q4_K_AMPERE 64
3536
+ #define MMQ_Y_Q4_K_AMPERE 128
3537
+ #define NWARPS_Q4_K_AMPERE 4
3538
+ #define MMQ_X_Q4_K_PASCAL 64
3539
+ #define MMQ_Y_Q4_K_PASCAL 64
3540
+ #define NWARPS_Q4_K_PASCAL 8
3541
+
3542
+ template <bool need_check> static __global__ void
3543
+ #if __CUDA_ARCH__ < CC_TURING
3544
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
3545
+ #endif // __CUDA_ARCH__ < CC_TURING
3546
+ mul_mat_q4_K(
3547
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3548
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3549
+
3550
+ #if __CUDA_ARCH__ >= CC_TURING
3551
+ const int mmq_x = MMQ_X_Q4_K_AMPERE;
3552
+ const int mmq_y = MMQ_Y_Q4_K_AMPERE;
3553
+ const int nwarps = NWARPS_Q4_K_AMPERE;
3554
+
3555
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
3556
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3557
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3558
+
3559
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3560
+ const int mmq_x = MMQ_X_Q4_K_PASCAL;
3561
+ const int mmq_y = MMQ_Y_Q4_K_PASCAL;
3562
+ const int nwarps = NWARPS_Q4_K_PASCAL;
3563
+
3564
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
3565
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3566
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3567
+ #else
3568
+ (void) vec_dot_q4_K_q8_1_mul_mat;
3569
+ assert(false);
3570
+ #endif // __CUDA_ARCH__ >= CC_TURING
3571
+ }
3572
+
3573
+ #define MMQ_X_Q5_K_AMPERE 64
3574
+ #define MMQ_Y_Q5_K_AMPERE 128
3575
+ #define NWARPS_Q5_K_AMPERE 4
3576
+ #define MMQ_X_Q5_K_PASCAL 64
3577
+ #define MMQ_Y_Q5_K_PASCAL 64
3578
+ #define NWARPS_Q5_K_PASCAL 8
3579
+
3580
+ template <bool need_check> static __global__ void mul_mat_q5_K(
3581
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3582
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3583
+
3584
+ #if __CUDA_ARCH__ >= CC_TURING
3585
+ const int mmq_x = MMQ_X_Q5_K_AMPERE;
3586
+ const int mmq_y = MMQ_Y_Q5_K_AMPERE;
3587
+ const int nwarps = NWARPS_Q5_K_AMPERE;
3588
+
3589
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
3590
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
3591
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3592
+
3593
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3594
+ const int mmq_x = MMQ_X_Q5_K_PASCAL;
3595
+ const int mmq_y = MMQ_Y_Q5_K_PASCAL;
3596
+ const int nwarps = NWARPS_Q5_K_PASCAL;
3597
+
3598
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
3599
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
3600
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3601
+ #else
3602
+ (void) vec_dot_q5_K_q8_1_mul_mat;
3603
+ assert(false);
3604
+ #endif // __CUDA_ARCH__ >= CC_TURING
3605
+ }
3606
+
3607
+ #define MMQ_X_Q6_K_AMPERE 64
3608
+ #define MMQ_Y_Q6_K_AMPERE 64
3609
+ #define NWARPS_Q6_K_AMPERE 4
3610
+ #define MMQ_X_Q6_K_PASCAL 64
3611
+ #define MMQ_Y_Q6_K_PASCAL 64
3612
+ #define NWARPS_Q6_K_PASCAL 8
3613
+
3614
+ template <bool need_check> static __global__ void
3615
+ #if __CUDA_ARCH__ < CC_TURING
3616
+ __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
3617
+ #endif // __CUDA_ARCH__ < CC_TURING
3618
+ mul_mat_q6_K(
3619
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3620
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3621
+
3622
+ #if __CUDA_ARCH__ >= CC_TURING
3623
+ const int mmq_x = MMQ_X_Q6_K_AMPERE;
3624
+ const int mmq_y = MMQ_Y_Q6_K_AMPERE;
3625
+ const int nwarps = NWARPS_Q6_K_AMPERE;
3626
+
3627
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
3628
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
3629
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3630
+
3631
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3632
+ const int mmq_x = MMQ_X_Q6_K_PASCAL;
3633
+ const int mmq_y = MMQ_Y_Q6_K_PASCAL;
3634
+ const int nwarps = NWARPS_Q6_K_PASCAL;
3635
+
3636
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
3637
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
3638
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3639
+ #else
3640
+ (void) vec_dot_q6_K_q8_1_mul_mat;
3641
+ assert(false);
3642
+ #endif // __CUDA_ARCH__ >= CC_TURING
3643
+ }
3644
+
3173
3645
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
3174
3646
  static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
3175
3647
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -3780,7 +4252,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
3780
4252
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3781
4253
  const dim3 block_nums(1, block_num_y, 1);
3782
4254
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3783
- mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1>
4255
+ mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
3784
4256
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3785
4257
  }
3786
4258
 
@@ -3789,7 +4261,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
3789
4261
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3790
4262
  const dim3 block_nums(1, block_num_y, 1);
3791
4263
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3792
- mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1>
4264
+ mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
3793
4265
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3794
4266
  }
3795
4267
 
@@ -3798,7 +4270,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
3798
4270
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3799
4271
  const dim3 block_nums(1, block_num_y, 1);
3800
4272
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3801
- mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1>
4273
+ mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
3802
4274
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3803
4275
  }
3804
4276
 
@@ -3807,7 +4279,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
3807
4279
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3808
4280
  const dim3 block_nums(1, block_num_y, 1);
3809
4281
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3810
- mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1>
4282
+ mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
3811
4283
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3812
4284
  }
3813
4285
 
@@ -3816,7 +4288,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
3816
4288
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3817
4289
  const dim3 block_nums(1, block_num_y, 1);
3818
4290
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3819
- mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1>
4291
+ mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
3820
4292
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3821
4293
  }
3822
4294
 
@@ -3867,17 +4339,36 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
3867
4339
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3868
4340
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3869
4341
 
3870
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3871
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
4342
+ int id;
4343
+ CUDA_CHECK(cudaGetDevice(&id));
4344
+ const int compute_capability = g_compute_capabilities[id];
4345
+
4346
+ int mmq_x, mmq_y, nwarps;
4347
+ if (compute_capability >= CC_TURING) {
4348
+ mmq_x = MMQ_X_Q4_0_AMPERE;
4349
+ mmq_y = MMQ_Y_Q4_0_AMPERE;
4350
+ nwarps = NWARPS_Q4_0_AMPERE;
4351
+ } else if (compute_capability >= MIN_CC_DP4A) {
4352
+ mmq_x = MMQ_X_Q4_0_PASCAL;
4353
+ mmq_y = MMQ_Y_Q4_0_PASCAL;
4354
+ nwarps = NWARPS_Q4_0_PASCAL;
4355
+ } else {
4356
+ GGML_ASSERT(false);
4357
+ }
4358
+
4359
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4360
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3872
4361
  const dim3 block_nums(block_num_x, block_num_y, 1);
3873
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4362
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
3874
4363
 
3875
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3876
- mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<false>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3877
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4364
+ if (nrows_x % mmq_y == 0) {
4365
+ const bool need_check = false;
4366
+ mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
4367
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3878
4368
  } else {
3879
- mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<true>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3880
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4369
+ const bool need_check = true;
4370
+ mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
4371
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3881
4372
  }
3882
4373
  }
3883
4374
 
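Starting with this hunk, every ggml_mul_mat_q*_q8_1_cuda launcher replaces the single compile-time GGML_CUDA_MMQ_Y tile height with a runtime choice based on the device's compute capability: Turing and newer (CC_TURING, 700) use the *_AMPERE tile constants, anything that still supports __dp4a (MIN_CC_DP4A, 610) uses the *_PASCAL constants, and older devices hit GGML_ASSERT(false). The grid then covers ceil(nrows_x / mmq_y) by ceil(ncols_y / mmq_x) tiles with nwarps warps per block. The same dispatch is repeated verbatim for every quantization type below; condensed into a sketch (the helper name is invented, the MMQ_X_*/MMQ_Y_*/NWARPS_* constants are defined elsewhere in the file):

    // Sketch only: per-architecture tile selection for the q4_0 launcher above.
    static void pick_mmq_config_q4_0(const int compute_capability, int & mmq_x, int & mmq_y, int & nwarps) {
        if (compute_capability >= CC_TURING) {          // 700: Turing, Ampere, ...
            mmq_x  = MMQ_X_Q4_0_AMPERE;
            mmq_y  = MMQ_Y_Q4_0_AMPERE;
            nwarps = NWARPS_Q4_0_AMPERE;
        } else if (compute_capability >= MIN_CC_DP4A) { // 610: Pascal-class cards with __dp4a
            mmq_x  = MMQ_X_Q4_0_PASCAL;
            mmq_y  = MMQ_Y_Q4_0_PASCAL;
            nwarps = NWARPS_Q4_0_PASCAL;
        } else {
            GGML_ASSERT(false);                         // mul_mat_q needs byte-wise integer dot products
        }
    }
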
@@ -3885,17 +4376,36 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
3885
4376
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3886
4377
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3887
4378
 
3888
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3889
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
4379
+ int id;
4380
+ CUDA_CHECK(cudaGetDevice(&id));
4381
+ const int compute_capability = g_compute_capabilities[id];
4382
+
4383
+ int mmq_x, mmq_y, nwarps;
4384
+ if (compute_capability >= CC_TURING) {
4385
+ mmq_x = MMQ_X_Q4_1_AMPERE;
4386
+ mmq_y = MMQ_Y_Q4_1_AMPERE;
4387
+ nwarps = NWARPS_Q4_1_AMPERE;
4388
+ } else if (compute_capability >= MIN_CC_DP4A) {
4389
+ mmq_x = MMQ_X_Q4_1_PASCAL;
4390
+ mmq_y = MMQ_Y_Q4_1_PASCAL;
4391
+ nwarps = NWARPS_Q4_1_PASCAL;
4392
+ } else {
4393
+ GGML_ASSERT(false);
4394
+ }
4395
+
4396
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4397
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3890
4398
  const dim3 block_nums(block_num_x, block_num_y, 1);
3891
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4399
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
3892
4400
 
3893
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3894
- mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<false>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3895
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4401
+ if (nrows_x % mmq_y == 0) {
4402
+ const bool need_check = false;
4403
+ mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
4404
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3896
4405
  } else {
3897
- mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<true>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3898
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4406
+ const bool need_check = true;
4407
+ mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
4408
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3899
4409
  }
3900
4410
  }
3901
4411
 
@@ -3903,17 +4413,36 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
3903
4413
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3904
4414
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3905
4415
 
3906
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3907
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
4416
+ int id;
4417
+ CUDA_CHECK(cudaGetDevice(&id));
4418
+ const int compute_capability = g_compute_capabilities[id];
4419
+
4420
+ int mmq_x, mmq_y, nwarps;
4421
+ if (compute_capability >= CC_TURING) {
4422
+ mmq_x = MMQ_X_Q5_0_AMPERE;
4423
+ mmq_y = MMQ_Y_Q5_0_AMPERE;
4424
+ nwarps = NWARPS_Q5_0_AMPERE;
4425
+ } else if (compute_capability >= MIN_CC_DP4A) {
4426
+ mmq_x = MMQ_X_Q5_0_PASCAL;
4427
+ mmq_y = MMQ_Y_Q5_0_PASCAL;
4428
+ nwarps = NWARPS_Q5_0_PASCAL;
4429
+ } else {
4430
+ GGML_ASSERT(false);
4431
+ }
4432
+
4433
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4434
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3908
4435
  const dim3 block_nums(block_num_x, block_num_y, 1);
3909
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4436
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
3910
4437
 
3911
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3912
- mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<false>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3913
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4438
+ if (nrows_x % mmq_y == 0) {
4439
+ const bool need_check = false;
4440
+ mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
4441
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3914
4442
  } else {
3915
- mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<true>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3916
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4443
+ const bool need_check = true;
4444
+ mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
4445
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3917
4446
  }
3918
4447
  }
3919
4448
 
@@ -3921,17 +4450,36 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
3921
4450
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3922
4451
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3923
4452
 
3924
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3925
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
4453
+ int id;
4454
+ CUDA_CHECK(cudaGetDevice(&id));
4455
+ const int compute_capability = g_compute_capabilities[id];
4456
+
4457
+ int mmq_x, mmq_y, nwarps;
4458
+ if (compute_capability >= CC_TURING) {
4459
+ mmq_x = MMQ_X_Q5_1_AMPERE;
4460
+ mmq_y = MMQ_Y_Q5_1_AMPERE;
4461
+ nwarps = NWARPS_Q5_1_AMPERE;
4462
+ } else if (compute_capability >= MIN_CC_DP4A) {
4463
+ mmq_x = MMQ_X_Q5_1_PASCAL;
4464
+ mmq_y = MMQ_Y_Q5_1_PASCAL;
4465
+ nwarps = NWARPS_Q5_1_PASCAL;
4466
+ } else {
4467
+ GGML_ASSERT(false);
4468
+ }
4469
+
4470
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4471
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3926
4472
  const dim3 block_nums(block_num_x, block_num_y, 1);
3927
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4473
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
3928
4474
 
3929
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3930
- mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<false>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3931
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4475
+ if (nrows_x % mmq_y == 0) {
4476
+ const bool need_check = false;
4477
+ mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
4478
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3932
4479
  } else {
3933
- mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<true>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3934
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4480
+ const bool need_check = true;
4481
+ mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
4482
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3935
4483
  }
3936
4484
  }
3937
4485
 
@@ -3939,17 +4487,36 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
3939
4487
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3940
4488
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3941
4489
 
3942
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3943
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
4490
+ int id;
4491
+ CUDA_CHECK(cudaGetDevice(&id));
4492
+ const int compute_capability = g_compute_capabilities[id];
4493
+
4494
+ int mmq_x, mmq_y, nwarps;
4495
+ if (compute_capability >= CC_TURING) {
4496
+ mmq_x = MMQ_X_Q8_0_AMPERE;
4497
+ mmq_y = MMQ_Y_Q8_0_AMPERE;
4498
+ nwarps = NWARPS_Q8_0_AMPERE;
4499
+ } else if (compute_capability >= MIN_CC_DP4A) {
4500
+ mmq_x = MMQ_X_Q8_0_PASCAL;
4501
+ mmq_y = MMQ_Y_Q8_0_PASCAL;
4502
+ nwarps = NWARPS_Q8_0_PASCAL;
4503
+ } else {
4504
+ GGML_ASSERT(false);
4505
+ }
4506
+
4507
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4508
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3944
4509
  const dim3 block_nums(block_num_x, block_num_y, 1);
3945
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4510
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
3946
4511
 
3947
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3948
- mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<false>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3949
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4512
+ if (nrows_x % mmq_y == 0) {
4513
+ const bool need_check = false;
4514
+ mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
4515
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3950
4516
  } else {
3951
- mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<true>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3952
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4517
+ const bool need_check = true;
4518
+ mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
4519
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3953
4520
  }
3954
4521
  }
3955
4522
 
@@ -3957,17 +4524,36 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
3957
4524
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3958
4525
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3959
4526
 
3960
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3961
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
4527
+ int id;
4528
+ CUDA_CHECK(cudaGetDevice(&id));
4529
+ const int compute_capability = g_compute_capabilities[id];
4530
+
4531
+ int mmq_x, mmq_y, nwarps;
4532
+ if (compute_capability >= CC_TURING) {
4533
+ mmq_x = MMQ_X_Q2_K_AMPERE;
4534
+ mmq_y = MMQ_Y_Q2_K_AMPERE;
4535
+ nwarps = NWARPS_Q2_K_AMPERE;
4536
+ } else if (compute_capability >= MIN_CC_DP4A) {
4537
+ mmq_x = MMQ_X_Q2_K_PASCAL;
4538
+ mmq_y = MMQ_Y_Q2_K_PASCAL;
4539
+ nwarps = NWARPS_Q2_K_PASCAL;
4540
+ } else {
4541
+ GGML_ASSERT(false);
4542
+ }
4543
+
4544
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4545
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3962
4546
  const dim3 block_nums(block_num_x, block_num_y, 1);
3963
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4547
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
3964
4548
 
3965
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3966
- mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<false>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
3967
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4549
+ if (nrows_x % mmq_y == 0) {
4550
+ const bool need_check = false;
4551
+ mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
4552
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3968
4553
  } else {
3969
- mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<true>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
3970
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4554
+ const bool need_check = true;
4555
+ mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
4556
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3971
4557
  }
3972
4558
  }
3973
4559
 
@@ -3975,17 +4561,36 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
3975
4561
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3976
4562
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3977
4563
 
3978
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3979
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
4564
+ int id;
4565
+ CUDA_CHECK(cudaGetDevice(&id));
4566
+ const int compute_capability = g_compute_capabilities[id];
4567
+
4568
+ int mmq_x, mmq_y, nwarps;
4569
+ if (compute_capability >= CC_TURING) {
4570
+ mmq_x = MMQ_X_Q3_K_AMPERE;
4571
+ mmq_y = MMQ_Y_Q3_K_AMPERE;
4572
+ nwarps = NWARPS_Q3_K_AMPERE;
4573
+ } else if (compute_capability >= MIN_CC_DP4A) {
4574
+ mmq_x = MMQ_X_Q3_K_PASCAL;
4575
+ mmq_y = MMQ_Y_Q3_K_PASCAL;
4576
+ nwarps = NWARPS_Q3_K_PASCAL;
4577
+ } else {
4578
+ GGML_ASSERT(false);
4579
+ }
4580
+
4581
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4582
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3980
4583
  const dim3 block_nums(block_num_x, block_num_y, 1);
3981
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4584
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
3982
4585
 
3983
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
3984
- mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<false>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
3985
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4586
+ if (nrows_x % mmq_y == 0) {
4587
+ const bool need_check = false;
4588
+ mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
4589
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3986
4590
  } else {
3987
- mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<true>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
3988
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4591
+ const bool need_check = true;
4592
+ mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
4593
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3989
4594
  }
3990
4595
  }
3991
4596
 
@@ -3993,17 +4598,36 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
3993
4598
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3994
4599
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3995
4600
 
3996
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
3997
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
4601
+ int id;
4602
+ CUDA_CHECK(cudaGetDevice(&id));
4603
+ const int compute_capability = g_compute_capabilities[id];
4604
+
4605
+ int mmq_x, mmq_y, nwarps;
4606
+ if (compute_capability >= CC_TURING) {
4607
+ mmq_x = MMQ_X_Q4_K_AMPERE;
4608
+ mmq_y = MMQ_Y_Q4_K_AMPERE;
4609
+ nwarps = NWARPS_Q4_K_AMPERE;
4610
+ } else if (compute_capability >= MIN_CC_DP4A) {
4611
+ mmq_x = MMQ_X_Q4_K_PASCAL;
4612
+ mmq_y = MMQ_Y_Q4_K_PASCAL;
4613
+ nwarps = NWARPS_Q4_K_PASCAL;
4614
+ } else {
4615
+ GGML_ASSERT(false);
4616
+ }
4617
+
4618
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4619
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3998
4620
  const dim3 block_nums(block_num_x, block_num_y, 1);
3999
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4621
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4000
4622
 
4001
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
4002
- mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<false>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
4003
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4623
+ if (nrows_x % mmq_y == 0) {
4624
+ const bool need_check = false;
4625
+ mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
4626
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4004
4627
  } else {
4005
- mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<true>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
4006
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4628
+ const bool need_check = true;
4629
+ mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
4630
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4007
4631
  }
4008
4632
  }
4009
4633
 
@@ -4011,17 +4635,36 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
4011
4635
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4012
4636
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4013
4637
 
4014
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
4015
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
4638
+ int id;
4639
+ CUDA_CHECK(cudaGetDevice(&id));
4640
+ const int compute_capability = g_compute_capabilities[id];
4641
+
4642
+ int mmq_x, mmq_y, nwarps;
4643
+ if (compute_capability >= CC_TURING) {
4644
+ mmq_x = MMQ_X_Q5_K_AMPERE;
4645
+ mmq_y = MMQ_Y_Q5_K_AMPERE;
4646
+ nwarps = NWARPS_Q5_K_AMPERE;
4647
+ } else if (compute_capability >= MIN_CC_DP4A) {
4648
+ mmq_x = MMQ_X_Q5_K_PASCAL;
4649
+ mmq_y = MMQ_Y_Q5_K_PASCAL;
4650
+ nwarps = NWARPS_Q5_K_PASCAL;
4651
+ } else {
4652
+ GGML_ASSERT(false);
4653
+ }
4654
+
4655
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4656
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4016
4657
  const dim3 block_nums(block_num_x, block_num_y, 1);
4017
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4658
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4018
4659
 
4019
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
4020
- mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<false>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
4021
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4660
+ if (nrows_x % mmq_y == 0) {
4661
+ const bool need_check = false;
4662
+ mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
4663
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4022
4664
  } else {
4023
- mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<true>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
4024
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4665
+ const bool need_check = true;
4666
+ mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
4667
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4025
4668
  }
4026
4669
  }
4027
4670
 
@@ -4029,17 +4672,36 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
4029
4672
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4030
4673
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4031
4674
 
4032
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
4033
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
4675
+ int id;
4676
+ CUDA_CHECK(cudaGetDevice(&id));
4677
+ const int compute_capability = g_compute_capabilities[id];
4678
+
4679
+ int mmq_x, mmq_y, nwarps;
4680
+ if (compute_capability >= CC_TURING) {
4681
+ mmq_x = MMQ_X_Q6_K_AMPERE;
4682
+ mmq_y = MMQ_Y_Q6_K_AMPERE;
4683
+ nwarps = NWARPS_Q6_K_AMPERE;
4684
+ } else if (compute_capability >= MIN_CC_DP4A) {
4685
+ mmq_x = MMQ_X_Q6_K_PASCAL;
4686
+ mmq_y = MMQ_Y_Q6_K_PASCAL;
4687
+ nwarps = NWARPS_Q6_K_PASCAL;
4688
+ } else {
4689
+ GGML_ASSERT(false);
4690
+ }
4691
+
4692
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4693
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4034
4694
  const dim3 block_nums(block_num_x, block_num_y, 1);
4035
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
4695
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4036
4696
 
4037
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
4038
- mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<false>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
4039
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4697
+ if (nrows_x % mmq_y == 0) {
4698
+ const bool need_check = false;
4699
+ mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
4700
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4040
4701
  } else {
4041
- mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<true>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
4042
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4702
+ const bool need_check = true;
4703
+ mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
4704
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4043
4705
  }
4044
4706
  }
4045
4707
 
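The second change repeated across these ten launchers is the kernel entry point: instead of instantiating the generic mul_mat_q template with load_tiles_*<false> or load_tiles_*<true>, each quantization type now launches a type-specific mul_mat_q*<need_check> kernel, and need_check is true only when nrows_x is not a multiple of the tile height mmq_y. The bounds checks therefore stay out of the compiled code for the common, tile-aligned case. The idea in isolation (hypothetical kernel, not from this file):

    // Sketch only: a compile-time need_check flag removes the bounds check when rows divide evenly.
    template <bool need_check>
    static __global__ void scale_rows(const float * x, float * dst, const int nrows) {
        const int row = blockIdx.x*blockDim.x + threadIdx.x;
        if (need_check && row >= nrows) {   // dead code when need_check == false
            return;
        }
        dst[row] = 2.0f*x[row];
    }
    // Launch site mirrors the pattern above:
    //   if (nrows % rows_per_block == 0) scale_rows<false><<<grid, block>>>(x, dst, nrows);
    //   else                             scale_rows<true ><<<grid, block>>>(x, dst, nrows);
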
@@ -4214,20 +4876,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
4214
4876
  }
4215
4877
 
4216
4878
 
4217
- static void * g_scratch_buffer = nullptr;
4218
- static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
4219
- static size_t g_scratch_offset = 0;
4220
-
4221
- static int g_device_count = -1;
4222
- static int g_main_device = 0;
4223
- static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
4224
- static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
4225
- static bool g_mul_mat_q = false;
4226
-
4227
- static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
4228
-
4229
- static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
4230
-
4231
4879
  void ggml_init_cublas() {
4232
4880
  static bool initialized = false;
4233
4881
 
@@ -4583,6 +5231,37 @@ inline void ggml_cuda_op_mul_mat_q(
4583
5231
  (void) i1;
4584
5232
  }
4585
5233
 
5234
+ static int64_t get_row_rounding(ggml_type type) {
5235
+ int max_compute_capability = INT_MIN;
5236
+ for (int id = 0; id < g_device_count; ++id) {
5237
+ if (max_compute_capability < g_compute_capabilities[id]
5238
+ && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
5239
+ max_compute_capability = g_compute_capabilities[id];
5240
+ }
5241
+ }
5242
+
5243
+ switch(type) {
5244
+ case GGML_TYPE_Q4_0:
5245
+ case GGML_TYPE_Q4_1:
5246
+ return max_compute_capability >= CC_TURING ? 128 : 64;
5247
+ case GGML_TYPE_Q5_0:
5248
+ case GGML_TYPE_Q5_1:
5249
+ case GGML_TYPE_Q8_0:
5250
+ return 64;
5251
+ case GGML_TYPE_F16:
5252
+ return 1;
5253
+ case GGML_TYPE_Q2_K:
5254
+ case GGML_TYPE_Q3_K:
5255
+ case GGML_TYPE_Q4_K:
5256
+ case GGML_TYPE_Q5_K:
5257
+ return max_compute_capability >= CC_TURING ? 128 : 64;
5258
+ case GGML_TYPE_Q6_K:
5259
+ return 64;
5260
+ default:
5261
+ GGML_ASSERT(false);
5262
+ }
5263
+ }
5264
+
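get_row_rounding replaces the old fixed GGML_CUDA_MMQ_Y granularity for splitting a tensor's rows across GPUs: split boundaries must stay multiples of the tile height, which now depends on the quantization type and on the best compute capability among the devices that actually receive rows of the split. The hunks further down apply it as follows; the numbers here are illustrative only:

    // Illustrative numbers, not from the library: a Q4_K tensor with 5000 rows split
    // 50/50 across two Turing-class GPUs, so get_row_rounding returns 128.
    const int64_t nrows0   = 5000;
    const int64_t rounding = 128;                 // get_row_rounding(GGML_TYPE_Q4_K)
    int64_t row_low = (int64_t)(nrows0*0.5f);     // split boundary for device 1 -> 2500
    row_low -= row_low % rounding;                // 2500 % 128 == 68 -> row_low == 2432
    // device 0 ends up with rows [0, 2432), device 1 with rows [2432, 5000).
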
4586
5265
  inline void ggml_cuda_op_mul_mat_vec(
4587
5266
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
4588
5267
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -4983,14 +5662,16 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
4983
5662
 
4984
5663
  int64_t row_low, row_high;
4985
5664
  if (split) {
5665
+ const int64_t rounding = get_row_rounding(src0->type);
5666
+
4986
5667
  row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
4987
- row_low -= row_low % GGML_CUDA_MMQ_Y;
5668
+ row_low -= row_low % rounding;
4988
5669
 
4989
5670
  if (id == g_device_count - 1) {
4990
5671
  row_high = nrows0;
4991
5672
  } else {
4992
5673
  row_high = nrows0*g_tensor_split[id + 1];
4993
- row_high -= row_high % GGML_CUDA_MMQ_Y;
5674
+ row_high -= row_high % rounding;
4994
5675
  }
4995
5676
  } else {
4996
5677
  row_low = 0;
@@ -5203,7 +5884,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
5203
5884
  if (split && g_device_count > 1) {
5204
5885
  CUDA_CHECK(cudaSetDevice(g_main_device));
5205
5886
  for (int id = 0; id < g_device_count; ++id) {
5206
- if (id != g_main_device) {
5887
+ if (id != g_main_device && src0_extra->events[id]) {
5207
5888
  CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
5208
5889
  }
5209
5890
  }
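The added src0_extra->events[id] check makes the main device wait only on events that actually exist, presumably to cover devices that never recorded one, for example because their slice of the row split ends up empty. In isolation the guard amounts to the following (a minimal sketch, not the library's code):

    // Sketch only: skip events that were never created/recorded for this device.
    static void wait_if_recorded(cudaStream_t stream, cudaEvent_t event) {
        if (event != nullptr) {
            CUDA_CHECK(cudaStreamWaitEvent(stream, event, 0));
        }
    }
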
@@ -5347,7 +6028,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
5347
6028
  } else {
5348
6029
  int min_compute_capability = INT_MAX;
5349
6030
  for (int id = 0; id < g_device_count; ++id) {
5350
- if (min_compute_capability > g_compute_capabilities[id]) {
6031
+ if (min_compute_capability > g_compute_capabilities[id]
6032
+ && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
5351
6033
  min_compute_capability = g_compute_capabilities[id];
5352
6034
  }
5353
6035
  }
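The extra g_tensor_split condition mirrors the one inside get_row_rounding above: a device only influences the chosen kernel path if its slice of the split is non-empty, that is, if its split boundary lies strictly below the next one (or below 1.0 for the last device). With two devices and g_tensor_split = {0.0f, 1.0f}, for example, device 1 owns the empty range [1.0, 1.0) and its compute capability is ignored. As a standalone predicate (hypothetical helper name):

    // Sketch only: does device `id` receive a non-empty share of the row split?
    static bool device_gets_rows(const int id, const int device_count, const float * tensor_split) {
        const float split_end = id + 1 < device_count ? tensor_split[id + 1] : 1.0f;
        return tensor_split[id] < split_end;
    }
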
@@ -5468,14 +6150,16 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
5468
6150
  row_low = 0;
5469
6151
  row_high = nrows;
5470
6152
  } else if (backend == GGML_BACKEND_GPU_SPLIT) {
6153
+ const int64_t rounding = get_row_rounding(tensor->type);
6154
+
5471
6155
  row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
5472
- row_low -= row_low % GGML_CUDA_MMQ_Y;
6156
+ row_low -= row_low % rounding;
5473
6157
 
5474
6158
  if (id == g_device_count - 1) {
5475
6159
  row_high = nrows;
5476
6160
  } else {
5477
6161
  row_high = nrows*g_tensor_split[id + 1];
5478
- row_high -= row_high % GGML_CUDA_MMQ_Y;
6162
+ row_high -= row_high % rounding;
5479
6163
  }
5480
6164
  } else {
5481
6165
  GGML_ASSERT(false);
@@ -5785,3 +6469,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
5785
6469
  func(tensor->src[0], tensor->src[1], tensor);
5786
6470
  return true;
5787
6471
  }
6472
+
6473
+ int ggml_cuda_get_device_count() {
6474
+ int device_count;
6475
+ CUDA_CHECK(cudaGetDeviceCount(&device_count));
6476
+ return device_count;
6477
+ }
6478
+
6479
+ void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
6480
+ cudaDeviceProp prop;
6481
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
6482
+ snprintf(description, description_size, "%s", prop.name);
6483
+ }
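The two functions appended at the end expose simple device enumeration to code outside the CUDA backend. A minimal caller might look like this, assuming the declarations are visible through ggml-cuda.h:

    // Hedged usage sketch for the two new helpers.
    #include <stdio.h>
    void print_cuda_devices(void) {
        const int n = ggml_cuda_get_device_count();
        for (int i = 0; i < n; ++i) {
            char desc[256];
            ggml_cuda_get_device_description(i, desc, sizeof(desc));
            printf("CUDA device %d: %s\n", i, desc);
        }
    }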