llama_cpp 0.3.6 → 0.3.8

@@ -14,6 +14,7 @@
  #include "ggml.h"

  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+ #define CC_TURING 700

  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -262,10 +263,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256

- #ifndef GGML_CUDA_MMQ_Y
- #define GGML_CUDA_MMQ_Y 64
- #endif // GGML_CUDA_MMQ_Y
-
  // dmmv = dequantize_mul_mat_vec
  #ifndef GGML_CUDA_DMMV_X
  #define GGML_CUDA_DMMV_X 32
@@ -285,6 +282,20 @@ struct ggml_tensor_extra_gpu {
  cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
  };

+ static int g_device_count = -1;
+ static int g_main_device = 0;
+ static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+ static bool g_mul_mat_q = false;
+
+ static void * g_scratch_buffer = nullptr;
+ static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
+ static size_t g_scratch_offset = 0;
+
+ static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+ static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
+
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -1383,9 +1394,12 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
  sumi = __dp4a(vi1, u[2*i+1], sumi);
  }

+ const float2 ds8f = __half22float2(ds8);
+
  // second part effectively subtracts 8 from each quant value
- return d4 * (sumi * __half2float(ds8.x) - (8*vdr/QI4_0) * __half2float(ds8.y));
+ return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
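Side note on the hunk above: it replaces the per-component __half2float(ds8.x/.y) calls with a single __half22float2(ds8) conversion and adds assert(false) on the pre-dp4a fallback path. The following standalone sketch (not part of the gem's sources; file name, values, and the trivial kernel are illustrative only, and it assumes an sm_61+ GPU) shows the two CUDA intrinsics involved, __dp4a and __half22float2:

// dp4a_demo.cu (hypothetical); build with: nvcc -arch=sm_61 dp4a_demo.cu
#include <cstdio>
#include <cuda_fp16.h>

__global__ void dp4a_demo(int v, int u, float d, float s, float * out) {
    const int sumi   = __dp4a(v, u, 0);          // byte-wise dot product of two packed int8x4 values, accumulated into 0
    const half2 ds   = __floats2half2_rn(d, s);  // pack a scale/offset pair the way block_q8_1 stores its ds field
    const float2 dsf = __half22float2(ds);       // unpack both halves with one conversion, as in the hunk above
    *out = sumi * dsf.x + dsf.y;
}

int main() {
    float * out = nullptr;
    cudaMallocManaged(&out, sizeof(float));
    dp4a_demo<<<1, 1>>>(0x01020304, 0x01010101, 0.5f, 1.0f, out); // bytes (4,3,2,1).(1,1,1,1) = 10
    cudaDeviceSynchronize();
    printf("%f\n", *out); // expected: 10 * 0.5 + 1.0 = 6.0
    cudaFree(out);
    return 0;
}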
@@ -1410,17 +1424,20 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
  }

  #ifdef GGML_CUDA_F16
- const half2 tmp = __hmul2(dm4, ds8);
- const float d4d8 = __half2float(tmp.x);
- const float m4s8 = __half2float(tmp.y);
+ const float2 tmp = __half22float2(__hmul2(dm4, ds8));
+ const float d4d8 = tmp.x;
+ const float m4s8 = tmp.y;
  #else
- const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x);
- const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y);
+ const float2 dm4f = __half22float2(dm4);
+ const float2 ds8f = __half22float2(ds8);
+ const float d4d8 = dm4f.x * ds8f.x;
+ const float m4s8 = dm4f.y * ds8f.y;
  #endif // GGML_CUDA_F16

  // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
  return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1434,6 +1451,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  int sumi = 0;

+ #pragma unroll
  for (int i = 0; i < vdr; ++i) {
  int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
  vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
@@ -1450,9 +1468,12 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
  sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
  }

+ const float2 ds8f = __half22float2(ds8);
+
  // second part effectively subtracts 16 from each quant value
- return d5 * (sumi*__half2float(ds8.x) - (16*vdr/QI5_0) * __half2float(ds8.y));
+ return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1466,6 +1487,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  int sumi = 0;

+ #pragma unroll
  for (int i = 0; i < vdr; ++i) {
  int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
  vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
@@ -1483,18 +1505,21 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
  }

  #ifdef GGML_CUDA_F16
- const half2 tmp = __hmul2(dm5, ds8);
- const float d5d8 = __half2float(tmp.x);
- const float m5s8 = __half2float(tmp.y);
+ const float2 tmp = __half22float2(__hmul2(dm5, ds8));
+ const float d5d8 = tmp.x;
+ const float m5s8 = tmp.y;
  #else
- const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x);
- const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y);
+ const float2 dm5f = __half22float2(dm5);
+ const float2 ds8f = __half22float2(ds8);
+ const float d5d8 = dm5f.x * ds8f.x;
+ const float m5s8 = dm5f.y * ds8f.y;
  #endif // GGML_CUDA_F16

  // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
  return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1503,18 +1528,20 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
  #define VDR_Q8_0_Q8_1_MMQ 8

  template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
- const int * v, const int * u, const float & d8_0, const half2 & ds8_1) {
+ const int * v, const int * u, const float & d8_0, const float & d8_1) {

  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  int sumi = 0;

+ #pragma unroll
  for (int i = 0; i < vdr; ++i) {
  // SIMD dot product of quantized values
  sumi = __dp4a(v[i], u[i], sumi);
  }

- return sumi * d8_0 * __half2float(ds8_1.x);
+ return d8_0*d8_1 * sumi;
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1525,23 +1552,374 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
  #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
  int sumi = 0;

+ #pragma unroll
  for (int i = 0; i < vdr; ++i) {
  // SIMD dot product of quantized values
  sumi = __dp4a(v[i], u[i], sumi);
  }

  #ifdef GGML_CUDA_F16
- const half2 tmp = __hmul2(dm8, ds8);
- const float d8d8 = __half2float(tmp.x);
- const float m8s8 = __half2float(tmp.y);
+ const float2 tmp = __half22float2(__hmul2(dm8, ds8));
+ const float d8d8 = tmp.x;
+ const float m8s8 = tmp.y;
  #else
- const float d8d8 = __half2float(dm8.x) * __half2float(ds8.x);
- const float m8s8 = __half2float(dm8.y) * __half2float(ds8.y);
+ const float2 dm8f = __half22float2(dm8);
+ const float2 ds8f = __half22float2(ds8);
+ const float d8d8 = dm8f.x * ds8f.x;
+ const float m8s8 = dm8f.y * ds8f.y;
  #endif // GGML_CUDA_F16

  // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
  return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
  #else
+ assert(false);
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ #define VDR_Q2_K_Q8_1_MMVQ 1
+ #define VDR_Q2_K_Q8_1_MMQ 2
+
+ // contiguous v/x values
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
+ const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
+ const half2 & dm2, const float * __restrict__ d8) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ float sumf_d = 0.0f;
+ float sumf_m = 0.0f;
+
+ #pragma unroll
+ for (int i = 0; i < QR2_K; ++i) {
+ const int sc = scales[2*i];
+
+ const int vi = (v >> (2*i)) & 0x03030303;
+
+ sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
+
+ // fill int with 4x m
+ int m = sc >> 4;
+ m |= m << 8;
+ m |= m << 16;
+ sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
+ }
+
+ const float2 dm2f = __half22float2(dm2);
+
+ return dm2f.x*sumf_d - dm2f.y*sumf_m;
+ #else
+ assert(false);
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ // contiguous u/y values
+ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
+ const half2 & dm2, const float & d8) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ int sumi_d = 0;
+ int sumi_m = 0;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
+ int sumi_d_sc = 0;
+
+ const int sc = scales[i0 / (QI8_1/2)];
+
+ // fill int with 4x m
+ int m = sc >> 4;
+ m |= m << 8;
+ m |= m << 16;
+
+ #pragma unroll
+ for (int i = i0; i < i0 + QI8_1/2; ++i) {
+ sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
+ sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m
+ }
+
+ sumi_d += sumi_d_sc * (sc & 0xF);
+ }
+
+ const float2 dm2f = __half22float2(dm2);
+
+ return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
+ #else
+ assert(false);
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ #define VDR_Q3_K_Q8_1_MMVQ 1
+ #define VDR_Q3_K_Q8_1_MMQ 2
+
+ // contiguous v/x values
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
+ const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
+ const int & scale_offset, const float & d3, const float * __restrict__ d8) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ float sumf = 0.0f;
+
+ #pragma unroll
+ for (int i = 0; i < QR3_K; ++i) {
+ const int isc = scale_offset + 2*i;
+
+ const int isc_low = isc % (QK_K/32);
+ const int sc_shift_low = 4 * (isc / (QK_K/32));
+ const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
+
+ const int isc_high = isc % (QK_K/64);
+ const int sc_shift_high = 2 * (isc / (QK_K/64));
+ const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+ const int sc = (sc_low | sc_high) - 32;
+
+ const int vil = (vl >> (2*i)) & 0x03030303;
+
+ const int vih = ((vh >> i) << 2) & 0x04040404;
+
+ const int vi = __vsubss4(vil, vih);
+
+ sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+ }
+
+ return d3 * sumf;
+ #else
+ assert(false);
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ // contiguous u/y values
+ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
+ const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
+ const float & d3, const float & d8) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ int sumi = 0;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
+ int sumi_sc = 0;
+
+ for (int i = i0; i < i0 + QI8_1/2; ++i) {
+ sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
+ }
+
+ sumi += sumi_sc * scales[i0 / (QI8_1/2)];
+ }
+
+ return d3*d8 * sumi;
+ #else
+ assert(false);
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ #define VDR_Q4_K_Q8_1_MMVQ 2
+ #define VDR_Q4_K_Q8_1_MMQ 8
+
+ // contiguous v/x values
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+ const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ float sumf_d = 0.0f;
+ float sumf_m = 0.0f;
+
+ #pragma unroll
+ for (int i = 0; i < QR4_K; ++i) {
+ const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
+ const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
+
+ const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
+ const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
+
+ sumf_d += d8[i] * (dot1 * sc[i]);
+ sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
+ }
+
+ const float2 dm4f = __half22float2(dm4);
+
+ return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+ #else
+ assert(false);
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ // contiguous u/y values
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+ const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ float sumf_d = 0.0f;
+ float sumf_m = 0.0f;
+
+ #pragma unroll
+ for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
+ int sumi_d = 0;
+
+ #pragma unroll
+ for (int j = 0; j < QI8_1; ++j) {
+ sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
+ }
+
+ const float2 ds8f = __half22float2(ds8[i]);
+
+ sumf_d += ds8f.x * (sc[i] * sumi_d);
+ sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
+ }
+
+ const float2 dm4f = __half22float2(dm4);
+
+ return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+ #else
+ assert(false);
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ #define VDR_Q5_K_Q8_1_MMVQ 2
+ #define VDR_Q5_K_Q8_1_MMQ 8
+
+ // contiguous v/x values
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
+ const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+ const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ float sumf_d = 0.0f;
+ float sumf_m = 0.0f;
+
+ #pragma unroll
+ for (int i = 0; i < QR5_K; ++i) {
+ const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
+ const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
+
+ const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
+ const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
+
+ const int v0i = vl0i | vh0i;
+ const int v1i = vl1i | vh1i;
+
+ const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
+ const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
+
+ sumf_d += d8[i] * (dot1 * sc[i]);
+ sumf_m += d8[i] * (dot2 * m[i]);
+
+ }
+
+ const float2 dm5f = __half22float2(dm5);
+
+ return dm5f.x*sumf_d - dm5f.y*sumf_m;
+
+ #else
+ assert(false);
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ // contiguous u/y values
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+ const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ float sumf_d = 0.0f;
+ float sumf_m = 0.0f;
+
+ #pragma unroll
+ for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+ int sumi_d = 0;
+
+ #pragma unroll
+ for (int j = 0; j < QI8_1; ++j) {
+ sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+ }
+
+ const float2 ds8f = __half22float2(ds8[i]);
+
+ sumf_d += ds8f.x * (sc[i] * sumi_d);
+ sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
+ }
+
+ const float2 dm4f = __half22float2(dm4);
+
+ return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+ #else
+ assert(false);
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ #define VDR_Q6_K_Q8_1_MMVQ 1
+ #define VDR_Q6_K_Q8_1_MMQ 8
+
+ // contiguous v/x values
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
+ const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
+ const float & d, const float * __restrict__ d8) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ float sumf = 0.0f;
+
+ #pragma unroll
+ for (int i = 0; i < QR6_K; ++i) {
+ const int sc = scales[4*i];
+
+ const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+ const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
+
+ const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+ sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+ }
+
+ return d*sumf;
+ #else
+ assert(false);
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ // contiguous u/y values
+ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
+ const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
+ const float & d6, const float * __restrict__ d8) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ float sumf_d = 0.0f;
+
+ #pragma unroll
+ for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
+ int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
+
+ #pragma unroll
+ for (int i = i0; i < i0 + 2; ++i) {
+ sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
+ sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
+
+ sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
+ sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
+ }
+
+ sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
+ }
+
+ return d6 * sumf_d;
+
+ #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
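Side note on the new vec_dot_q2_K_q8_1_impl_* helpers added above: they use a broadcast trick, replicating the 4-bit minimum m into all four bytes of an int (m |= m << 8; m |= m << 16;) so that a single __dp4a(m, u[i], 0) equals m times the sum of the four int8 values packed in u[i]. A small host-side check of that identity (not part of the gem; __dp4a is emulated in plain C++ so it runs without a GPU, and the sample values are arbitrary):

#include <cassert>
#include <cstdint>

// Reference emulation of the signed __dp4a intrinsic used throughout the kernels above.
static int dp4a_ref(int a, int b, int c) {
    for (int k = 0; k < 4; ++k) {
        c += (int8_t)(a >> (8 * k)) * (int8_t)(b >> (8 * k));
    }
    return c;
}

int main() {
    const int sc = 0x57;       // packed q2_K scale byte: low nibble = scale, high nibble = min
    const int u  = 0x05FE037F; // four packed int8 values, low byte first: 127, 3, -2, 5

    // "fill int with 4x m", exactly as in vec_dot_q2_K_q8_1_impl_mmvq
    int m = sc >> 4;           // m = 5
    m |= m << 8;
    m |= m << 16;              // m = 0x05050505

    // dp4a against the broadcast value equals m * (sum of the four signed bytes of u)
    const int sum_u = (int8_t)u + (int8_t)(u >> 8) + (int8_t)(u >> 16) + (int8_t)(u >> 24);
    assert(dp4a_ref(m, u, 0) == (sc >> 4) * sum_u);
    return 0;
}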
@@ -1564,21 +1942,21 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
  return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
  }

- static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

- __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
- __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0];
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];

  *x_ql = tile_x_qs;
  *x_dm = (half2 *) tile_x_d;
  }

- template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

  __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < 8);
+ __builtin_assume(i_offset < nwarps);
  __builtin_assume(k >= 0);
  __builtin_assume(k < WARP_SIZE);

@@ -1590,7 +1968,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
  float * x_dmf = (float *) x_dm;

  #pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  int i = i0 + i_offset;

  if (need_check) {
@@ -1600,38 +1978,30 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
  const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;

  x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
- x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
+ // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
  }

- // const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
- // const int kbxd = k % blocks_per_tile_x_row;
+ const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
+ const int kbxd = k % blocks_per_tile_x_row;

- // #pragma unroll
- // for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) {
- // FIXME out-of-bounds
- // const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
+ int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;

- // if (i >= GGML_CUDA_MMQ_Y) {
- // return;
- // }
+ if (need_check) {
+ i = min(i, i_max);
+ }

- // const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;

- // x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d;
- // }
+ x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
+ }
  }
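Side note on the allocate_tiles/load_tiles changes above: the compile-time constant GGML_CUDA_MMQ_Y and the hard-coded 8 warps become the template parameters mmq_y and nwarps, so differently sized tile variants can coexist in one binary and be selected per device, consistent with the new g_compute_capabilities array and CC_TURING constant earlier in the diff. A minimal sketch of that pattern only (hypothetical names and sizes, not the gem's actual kernels; it assumes nrows and ncols are multiples of the tile dimensions):

// Hypothetical illustration: tile height as a template parameter instead of a macro.
template <int mmq_y, int nwarps>
__global__ void tile_copy_kernel(const float * x, float * dst, int ncols) {
    // Shared-memory size and loop trip count are still compile-time constants per instantiation.
    __shared__ float tile[mmq_y][32 + 1];
    const int col = blockIdx.x * 32 + threadIdx.x;
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        const int i = i0 + threadIdx.y;               // threadIdx.y plays the role of i_offset
        tile[i][threadIdx.x] = x[(blockIdx.y * mmq_y + i) * ncols + col];
    }
    __syncthreads();
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
        const int i = i0 + threadIdx.y;
        dst[(blockIdx.y * mmq_y + i) * ncols + col] = tile[i][threadIdx.x];
    }
}

// Both instantiations live in the same binary; the host can pick one per GPU,
// e.g. based on the compute capability recorded in g_compute_capabilities.
void launch(const float * x, float * dst, int nrows, int ncols, int cc) {
    const dim3 block(32, 8);
    if (cc >= 700) { // CC_TURING
        tile_copy_kernel<128, 8><<<dim3(ncols/32, nrows/128), block>>>(x, dst, ncols);
    } else {
        tile_copy_kernel< 64, 8><<<dim3(ncols/32, nrows/ 64), block>>>(x, dst, ncols);
    }
}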

  static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

- __builtin_assume(i >= 0);
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
- __builtin_assume(j >= 0);
- __builtin_assume(j < WARP_SIZE);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
-
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  const float * x_dmf = (float *) x_dm;

@@ -1639,13 +2009,13 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(

  #pragma unroll
  for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0];
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
  }

  return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
  (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
- y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
  }

  static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
@@ -1666,21 +2036,21 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
  return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
  }

- static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

- __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + + GGML_CUDA_MMQ_Y];
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1) + GGML_CUDA_MMQ_Y/QI4_1];
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];

  *x_ql = tile_x_qs;
  *x_dm = tile_x_dm;
  }

- template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

  __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < 8);
+ __builtin_assume(i_offset < nwarps);
  __builtin_assume(k >= 0);
  __builtin_assume(k < WARP_SIZE);

@@ -1690,7 +2060,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
  const block_q4_1 * bx0 = (block_q4_1 *) vx;

  #pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  int i = i0 + i_offset;

  if (need_check) {
@@ -1706,7 +2076,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
  const int kbxd = k % blocks_per_tile_x_row;

  #pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
  int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;

  if (need_check) {
@@ -1723,26 +2093,19 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

- __builtin_assume(i >= 0);
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
- __builtin_assume(j >= 0);
- __builtin_assume(j < WARP_SIZE);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
-
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));

  int u[2*VDR_Q4_1_Q8_1_MMQ];

  #pragma unroll
  for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1];
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
  }

  return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
  (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
- y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
+ y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
  }

  static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
@@ -1765,21 +2128,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
  return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
  }

- static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
- __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0];
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];

  *x_ql = tile_x_ql;
  *x_dm = (half2 *) tile_x_d;
  }

- template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

  __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < 8);
+ __builtin_assume(i_offset < nwarps);
  __builtin_assume(k >= 0);
  __builtin_assume(k < WARP_SIZE);

@@ -1789,7 +2152,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
  const block_q5_0 * bx0 = (block_q5_0 *) vx;

  #pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  int i = i0 + i_offset;

  if (need_check) {
@@ -1825,7 +2188,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
  float * x_dmf = (float *) x_dm;

  #pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
  int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;

  if (need_check) {
@@ -1842,27 +2205,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

- __builtin_assume(i >= 0);
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
- __builtin_assume(j >= 0);
- __builtin_assume(j < WARP_SIZE);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
-
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
- const float * x_dmf = (float *) x_dm;
+ const float * x_dmf = (const float *) x_dm;
+ const float * y_df = (const float *) y_ds;

  int u[2*VDR_Q5_0_Q8_1_MMQ];

  #pragma unroll
  for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0];
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
  }

  return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
- (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
  }

  static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
@@ -1885,21 +2242,21 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
  return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
  }

- static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y];
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1];
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];

  *x_ql = tile_x_ql;
  *x_dm = tile_x_dm;
  }

- template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

  __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < 8);
+ __builtin_assume(i_offset < nwarps);
  __builtin_assume(k >= 0);
  __builtin_assume(k < WARP_SIZE);

@@ -1909,7 +2266,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
  const block_q5_1 * bx0 = (block_q5_1 *) vx;

  #pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  int i = i0 + i_offset;

  if (need_check) {
@@ -1942,7 +2299,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
  const int kbxd = k % blocks_per_tile_x_row;

  #pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
  int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;

  if (need_check) {
@@ -1959,13 +2316,6 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

- __builtin_assume(i >= 0);
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
- __builtin_assume(j >= 0);
- __builtin_assume(j < WARP_SIZE);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
-
  const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
  const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;

@@ -1973,12 +2323,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(

  #pragma unroll
  for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
- u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l];
- u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1];
+ u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+ u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
  }

  return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
- (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
+ (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
  }

  static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
@@ -1989,29 +2339,30 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
  int v[VDR_Q8_0_Q8_1_MMVQ];
  int u[VDR_Q8_0_Q8_1_MMVQ];

+ #pragma unroll
  for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
  v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
  u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
  }

- return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds);
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
  }

- static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

- __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
- __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0];
+ __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
+ __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];

  *x_ql = tile_x_qs;
  *x_dm = (half2 *) tile_x_d;
  }

- template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

  __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < 8);
+ __builtin_assume(i_offset < nwarps);
  __builtin_assume(k >= 0);
  __builtin_assume(k < WARP_SIZE);

@@ -2022,7 +2373,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
  const block_q8_0 * bx0 = (block_q8_0 *) vx;

  #pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  int i = i0 + i_offset;

  if (need_check) {
@@ -2032,76 +2383,35 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
  const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;

  x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
- x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
  }

- // const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
- // const int kbxd = k % blocks_per_tile_x_row;
+ const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
+ const int kbxd = k % blocks_per_tile_x_row;

- // #pragma unroll
- // for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) {
- // FIXME out-of-bounds
- // const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
+ #pragma unroll
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
+ int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;

- // #if GGML_CUDA_MMQ_Y < 64
- // if (i >= GGML_CUDA_MMQ_Y) {
- // return;
- // }
- // #endif // GGML_CUDA_MMQ_Y < 64
+ if (need_check) {
+ i = min(i, i_max);
+ }

- // const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;

- // x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d;
- // }
+ x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
+ }
  }

  static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

- __builtin_assume(i >= 0);
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
- __builtin_assume(j >= 0);
- __builtin_assume(j < WARP_SIZE);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
-
- const float * x_dmf = (float *) x_dm;
+ const float * x_dmf = (const float *) x_dm;
+ const float * y_df = (const float *) y_ds;

  return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
  (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
- y_ds[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
- }
-
- #define VDR_q2_K_q8_1 1
-
- static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl(
- const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
- const half2 & dm, const float * __restrict__ d8) {
-
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
- float sumf_d = 0.0f;
- float sumf_m = 0.0f;
-
- for (int i = 0; i < QR2_K; ++i) {
- const int sc = scales[2*i];
-
- const int vi = (v >> (2*i)) & 0x03030303;
-
- sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
-
- int sc_high = sc >> 4;
- sc_high |= sc_high << 8;
- sc_high |= sc_high << 16;
- sumf_m += d8[i] * __dp4a(sc_high, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
- }
-
- const float2 dmf = __half22float2(dm);
-
- return dmf.x*sumf_d - dmf.y*sumf_m;
- #else
- return 0.0f; // only to satisfy the compiler
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
  }

  static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
@@ -2115,34 +2425,35 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
  const uint8_t * scales = bq2_K->scales + scale_offset;

  const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
- int u[QR2_K];
+ int u[QR2_K];
  float d8[QR2_K];

+ #pragma unroll
  for (int i = 0; i < QR2_K; ++ i) {
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
  d8[i] = bq8_1[bq8_offset + i].ds.x;
  }

- return vec_dot_q2_K_q8_1_impl(v, u, scales, bq2_K->dm, d8);
+ return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
  }

- static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K];
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];

  *x_ql = tile_x_ql;
  *x_dm = tile_x_dm;
  *x_sc = tile_x_sc;
  }

- template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

  __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < 8);
+ __builtin_assume(i_offset < nwarps);
  __builtin_assume(k >= 0);
  __builtin_assume(k < WARP_SIZE);

@@ -2152,7 +2463,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
  const block_q2_K * bx0 = (block_q2_K *) vx;

  #pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
  int i = i0 + i_offset;

  if (need_check) {
@@ -2168,8 +2479,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
  const int kbxd = k % blocks_per_tile_x_row;

  #pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) {
- int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
+ int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;

  if (need_check) {
  i = min(i, i_max);
@@ -2181,7 +2492,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
  }

  #pragma unroll
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
  int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);

  if (need_check) {
@@ -2198,68 +2509,24 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

- __builtin_assume(i >= 0);
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
- __builtin_assume(j >= 0);
- __builtin_assume(j < WARP_SIZE);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
-
- const int kbx = k / QI2_K;
- const int kqsx = k % QI2_K;
-
- const int bq8_offset = QR2_K * (kqsx / QI8_1);
- const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
+ const int kbx = k / QI2_K;
+ const int ky = (k % QI2_K) * QR2_K;
+ const float * y_df = (const float *) y_ds;

- const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16 + scale_offset;
+ int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];

- int u[QR2_K];
- float d8[QR2_K];
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
+ const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));

- for (int l = 0; l < QR2_K; ++ l) {
- const int y_qs_index = j * (QR2_K*WARP_SIZE) + kbx * (QR2_K*QI2_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
- u[l] = y_qs[y_qs_index];
- d8[l] = y_ds[y_qs_index / QI8_1].x;
+ #pragma unroll
+ for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
+ v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
  }

- return vec_dot_q2_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], u, scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], d8);
- }
-
- #define VDR_q3_K_q8_1 1
-
- static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl(
- const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
- const int & scale_offset, const float & d, const float * __restrict__ d8) {
-
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
- float sumf = 0.0f;
-
- for (int i = 0; i < QR3_K; ++i) {
- const int isc = scale_offset + 2*i;
-
- const int isc_low = isc % (QK_K/32);
- const int sc_shift_low = 4 * (isc / (QK_K/32));
- const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
-
- const int isc_high = isc % (QK_K/64);
- const int sc_shift_high = 2 * (isc / (QK_K/64));
- const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
-
- const int sc = (sc_low | sc_high) - 32;
-
- const int vil = (vl >> (2*i)) & 0x03030303;
-
- const int vih = ((vh >> i) << 2) & 0x04040404;
-
- const int vi = __vsubss4(vil, vih);
+ const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;

- sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
- }
-
- return d*sumf;
- #else
- return 0.0f; // only to satisfy the compiler
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
+ return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
  }

  static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
@@ -2277,23 +2544,24 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
2277
2544
  // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2278
2545
  const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
2279
2546
 
2280
- int u[QR3_K];
2547
+ int u[QR3_K];
2281
2548
  float d8[QR3_K];
2282
2549
 
2550
+ #pragma unroll
2283
2551
  for (int i = 0; i < QR3_K; ++i) {
2284
2552
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
2285
2553
  d8[i] = bq8_1[bq8_offset + i].ds.x;
2286
2554
  }
2287
2555
 
2288
- return vec_dot_q3_K_q8_1_impl(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
2556
+ return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
2289
2557
  }
2290
2558
 
2291
- static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2559
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2292
2560
 
2293
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2294
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K];
2295
- __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
2296
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2561
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2562
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
2563
+ __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
2564
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
2297
2565
 
2298
2566
  *x_ql = tile_x_ql;
2299
2567
  *x_dm = tile_x_dm;
@@ -2301,12 +2569,12 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 **
2301
2569
  *x_sc = tile_x_sc;
2302
2570
  }
2303
2571
 
2304
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
2572
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
2305
2573
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2306
2574
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2307
2575
 
2308
2576
  __builtin_assume(i_offset >= 0);
2309
- __builtin_assume(i_offset < 8);
2577
+ __builtin_assume(i_offset < nwarps);
2310
2578
  __builtin_assume(k >= 0);
2311
2579
  __builtin_assume(k < WARP_SIZE);
2312
2580
 
@@ -2316,7 +2584,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2316
2584
  const block_q3_K * bx0 = (block_q3_K *) vx;
2317
2585
 
2318
2586
  #pragma unroll
2319
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2587
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2320
2588
  int i = i0 + i_offset;
2321
2589
 
2322
2590
  if (need_check) {
@@ -2330,10 +2598,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2330
2598
 
2331
2599
  const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
2332
2600
  const int kbxd = k % blocks_per_tile_x_row;
2601
+ float * x_dmf = (float *) x_dm;
2333
2602
 
2334
2603
  #pragma unroll
2335
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) {
2336
- int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2604
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
2605
+ int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
2337
2606
 
2338
2607
  if (need_check) {
2339
2608
  i = min(i, i_max);
@@ -2341,11 +2610,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2341
2610
 
2342
2611
  const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
2343
2612
 
2344
- x_dm[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd].x = bxi->d;
2613
+ x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
2345
2614
  }
2346
2615
 
2347
2616
  #pragma unroll
2348
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
2617
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
2349
2618
  int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
2350
2619
 
2351
2620
  if (need_check) {
@@ -2354,11 +2623,12 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2354
2623
 
2355
2624
  const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
2356
2625
 
2357
- x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
2626
+ // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2627
+ x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
2358
2628
  }
2359
2629
 
2360
2630
  #pragma unroll
2361
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2631
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
2362
2632
  int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2363
2633
 
2364
2634
  if (need_check) {
@@ -2367,7 +2637,19 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
2367
2637
 
2368
2638
  const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
2369
2639
 
2370
- x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->scales, k % (QI3_K/4));
2640
+ const int ksc = k % (QI3_K/4);
2641
+
2642
+ const int ksc_low = ksc % (QI3_K/8);
2643
+ const int shift_low = 4 * (ksc / (QI3_K/8));
2644
+ const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
2645
+
2646
+ const int ksc_high = QI3_K/8;
2647
+ const int shift_high = 2 * ksc;
2648
+ const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
2649
+
2650
+ const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
2651
+
2652
+ x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
2371
2653
  }
2372
2654
  }
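
For reference, the new scale path in load_tiles_q3_K above rebuilds four 6-bit q3_K scales per thread: their lower 4 bits and upper 2 bits are fetched separately, merged through the 0x0F0F0F0F and 0x30303030 byte masks, and recentered from [0, 63] to [-32, 31] by subtracting 32 from every byte with __vsubss4. A minimal host-side C++ sketch of the same byte-packing trick, with __vsubss4 emulated per byte and made-up scale values:

    #include <cstdint>
    #include <cstdio>

    // per-byte signed saturating subtraction, emulating the CUDA __vsubss4 intrinsic
    static uint32_t vsubss4(uint32_t a, uint32_t b) {
        uint32_t r = 0;
        for (int i = 0; i < 4; ++i) {
            int d = (int8_t)(a >> (8*i)) - (int8_t)(b >> (8*i));
            if (d >  127) d =  127;
            if (d < -128) d = -128;
            r |= (uint32_t)(uint8_t)d << (8*i);
        }
        return r;
    }

    int main() {
        const uint8_t scales[4] = {0, 17, 42, 63}; // four 6-bit scales, one per byte (illustrative)

        uint32_t low = 0, high = 0;
        for (int i = 0; i < 4; ++i) {
            low  |= (uint32_t)(scales[i] & 0x0F)      << (8*i); // lower 4 bits of each scale
            high |= (uint32_t)((scales[i] >> 4) << 4) << (8*i); // upper 2 bits, pre-shifted into bits 4..5
        }

        // merge the two halves and recenter every byte by 32, as in the kernel
        const uint32_t sc = vsubss4((low & 0x0F0F0F0F) | (high & 0x30303030), 0x20202020);

        for (int i = 0; i < 4; ++i) {
            printf("scale %d: %4d (expected %4d)\n", i, (int8_t)(sc >> (8*i)), scales[i] - 32);
        }
        return 0;
    }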
2373
2655
 
@@ -2375,63 +2657,29 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
2375
2657
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2376
2658
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2377
2659
 
2378
- __builtin_assume(i >= 0);
2379
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2380
- __builtin_assume(j >= 0);
2381
- __builtin_assume(j < WARP_SIZE);
2382
- __builtin_assume(k >= 0);
2383
- __builtin_assume(k < WARP_SIZE);
2384
-
2385
2660
  const int kbx = k / QI3_K;
2386
- const int kqsx = k % QI3_K;
2661
+ const int ky = (k % QI3_K) * QR3_K;
2662
+ const float * x_dmf = (const float *) x_dm;
2663
+ const float * y_df = (const float *) y_ds;
2387
2664
 
2388
- const int bq8_offset = QR3_K * (kqsx / (QI3_K/2));
2389
- const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
2665
+ const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
2390
2666
 
2391
- const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16;
2392
-
2393
- // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
2394
- const int vh = ~x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset;
2395
-
2396
- int u[QR3_K];
2397
- float d8[QR3_K];
2398
-
2399
- for (int l = 0; l < QR3_K; ++ l) {
2400
- const int y_qs_index = j * (QR3_K*WARP_SIZE) + kbx * (QR3_K*QI3_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
2401
- u[l] = y_qs[y_qs_index];
2402
- d8[l] = y_ds[y_qs_index / QI8_1].x;
2403
- }
2404
-
2405
- return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset,
2406
- x_dm[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx].x, d8);
2407
- }
2408
-
2409
- #define VDR_q4_K_q8_1 2
2410
-
2411
- static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl(
2412
- const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
2413
- const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
2414
-
2415
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2416
- float sumf_d = 0.0f;
2417
- float sumf_m = 0.0f;
2667
+ int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
2418
2668
 
2419
- for (int i = 0; i < QR4_K; ++i) {
2420
- const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
2421
- const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
2669
+ #pragma unroll
2670
+ for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
2671
+ const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
2672
+ const int shift = 2 * ((ky % 32) / 8);
2673
+ const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
2422
2674
 
2423
- const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
2424
- const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
2675
+ const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
2676
+ const int vlh = (vh << 2) & 0x04040404;
2425
2677
 
2426
- sumf_d += d8[i] * (dot1 * sc[i]);
2427
- sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
2678
+ v[l] = __vsubss4(vll, vlh);
2428
2679
  }
2429
2680
 
2430
- return __half2float(dm4.x)*sumf_d - __half2float(dm4.y)*sumf_m;
2431
-
2432
- #else
2433
- return 0.0f; // only to satisfy the compiler
2434
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2681
+ const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
2682
+ return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
2435
2683
  }
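
Because load_tiles_q3_K stores the high mask inverted (the ~ above), the new vec_dot_q3_K_q8_1_mul_mat can apply the q3_K offset as a plain byte-wise subtraction: vlh ends up as 4 where the original high bit was 0 and as 0 where it was 1, so __vsubss4(vll, vlh) reproduces the usual (2-bit quant) - (high bit ? 0 : 4) decoding without a select. A small host-side C++ check of that equivalence, one byte at a time:

    #include <cstdio>

    int main() {
        for (int q = 0; q < 4; ++q) {      // the 2-bit quant value
            for (int m = 0; m < 2; ++m) {  // the original high-mask bit
                const int reference = q - (m ? 0 : 4); // usual q3_K decoding

                const int stored = m ? 0 : 1;   // bit actually kept in x_qh after the ~
                const int vll    = q;           // (x_ql[...] >> shift) & 0x03, one byte
                const int vlh    = stored << 2; // (vh << 2) & 0x04, one byte
                const int value  = vll - vlh;   // what __vsubss4(vll, vlh) computes per byte

                printf("q=%d high_bit=%d -> %2d (reference %2d)\n", q, m, value, reference);
            }
        }
        return 0;
    }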
2436
2684
 
2437
2685
  static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
@@ -2478,7 +2726,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
2478
2726
  u[2*i+1] = q8[4];
2479
2727
  }
2480
2728
 
2481
- return vec_dot_q4_K_q8_1_impl(v, u, sc, m, bq4_K->dm, d8);
2729
+ return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
2482
2730
 
2483
2731
  #else
2484
2732
 
@@ -2521,29 +2769,30 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
2521
2769
  return dall * sumf_d - dmin * sumf_m;
2522
2770
 
2523
2771
  #else
2772
+ assert(false);
2524
2773
  return 0.0f; // only to satisfy the compiler
2525
2774
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2526
2775
 
2527
2776
  #endif
2528
2777
  }
2529
2778
 
2530
- static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2779
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2531
2780
 
2532
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2533
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K];
2534
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
2781
+ __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
2782
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
2783
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
2535
2784
 
2536
2785
  *x_ql = tile_x_ql;
2537
2786
  *x_dm = tile_x_dm;
2538
2787
  *x_sc = tile_x_sc;
2539
2788
  }
2540
2789
 
2541
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
2790
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
2542
2791
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2543
2792
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2544
2793
 
2545
2794
  __builtin_assume(i_offset >= 0);
2546
- __builtin_assume(i_offset < 8);
2795
+ __builtin_assume(i_offset < nwarps);
2547
2796
  __builtin_assume(k >= 0);
2548
2797
  __builtin_assume(k < WARP_SIZE);
2549
2798
 
@@ -2553,7 +2802,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
2553
2802
  const block_q4_K * bx0 = (block_q4_K *) vx;
2554
2803
 
2555
2804
  #pragma unroll
2556
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2805
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2557
2806
  int i = i0 + i_offset;
2558
2807
 
2559
2808
  if (need_check) {
@@ -2566,11 +2815,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
2566
2815
  }
2567
2816
 
2568
2817
  const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
2569
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2818
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2570
2819
 
2571
2820
  #pragma unroll
2572
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) {
2573
- int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
2821
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
2822
+ int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
2574
2823
 
2575
2824
  if (need_check) {
2576
2825
  i = min(i, i_max);
@@ -2582,8 +2831,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
2582
2831
  }
2583
2832
 
2584
2833
  #pragma unroll
2585
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
2586
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
2834
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
2835
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
2587
2836
 
2588
2837
  if (need_check) {
2589
2838
  i = min(i, i_max);
@@ -2591,90 +2840,27 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
2591
2840
 
2592
2841
  const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
2593
2842
 
2594
- x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI4_K/8));
2595
- }
2596
- }
2597
-
2598
- static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
2599
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2600
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2601
-
2602
- __builtin_assume(i >= 0);
2603
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2604
- __builtin_assume(j >= 0);
2605
- __builtin_assume(j < WARP_SIZE);
2606
- __builtin_assume(k >= 0);
2607
- __builtin_assume(k < WARP_SIZE);
2608
-
2609
- const int kbx = k / QI6_K; // == 0 if QK_K == 256
2610
- const int kqsx = k % QI6_K; // == k if QK_K == 256
2611
-
2612
- int v[2];
2613
- int u[2*QR4_K];
2614
- float d8[QR4_K];
2843
+ const int * scales = (int *) bxi->scales;
2615
2844
 
2616
- // kqsx is in 0,2...30. bq8_offset = 2 * (kqsx/4) -> bq8_offset = 0, 2, 4, 6
2617
- const int bq8_offset = QR4_K * ((kqsx/2) / (QI8_1/2));
2845
+ const int ksc = k % (WARP_SIZE/8);
2618
2846
 
2619
- v[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
2620
- v[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
2847
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
2848
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
2849
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
2621
2850
 
2622
- const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
2623
- uint16_t aux[2];
2624
- const int l = bq8_offset/2;
2625
- if (l < 2) {
2626
- aux[0] = scales[l+0] & 0x3f3f;
2627
- aux[1] = scales[l+2] & 0x3f3f;
2628
- } else {
2629
- aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
2630
- aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
2851
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
2631
2852
  }
2632
- const uint8_t * sc = (const uint8_t *)aux;
2633
- const uint8_t * m = sc + 2;
2634
-
2635
- for (int l = 0; l < QR4_K; ++l) {
2636
- const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
2637
- u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
2638
- u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
2639
- d8[l] = y_ds[kqsy / QI8_1].x;
2640
- }
2641
-
2642
- return vec_dot_q4_K_q8_1_impl(v, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K + kbx], d8);
2643
2853
  }
2644
2854
 
2645
- #define VDR_q5_K_q8_1 2
2646
-
2647
- static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
2648
- const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
2649
- const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
2650
-
2651
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2652
- float sumf_d = 0.0f;
2653
- float sumf_m = 0.0f;
2654
-
2655
- for (int i = 0; i < QR5_K; ++i) {
2656
- const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
2657
- const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
2658
-
2659
- const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
2660
- const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
2661
-
2662
- const int v0i = vl0i | vh0i;
2663
- const int v1i = vl1i | vh1i;
2664
-
2665
- const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
2666
- const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
2667
-
2668
- sumf_d += d8[i] * (dot1 * sc[i]);
2669
- sumf_m += d8[i] * (dot2 * m[i]);
2670
-
2671
- }
2855
+ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
2856
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2857
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2672
2858
 
2673
- return __half2float(dm5.x)*sumf_d - __half2float(dm5.y)*sumf_m;
2859
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
2674
2860
 
2675
- #else
2676
- return 0.0f; // only to satisfy the compiler
2677
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2861
+ const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
2862
+ return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
2863
+ x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
2678
2864
  }
2679
2865
 
2680
2866
  static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
@@ -2711,6 +2897,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
2711
2897
  const uint8_t * sc = (const uint8_t *)aux;
2712
2898
  const uint8_t * m = sc + 2;
2713
2899
 
2900
+ #pragma unroll
2714
2901
  for (int i = 0; i < QR5_K; ++i) {
2715
2902
  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
2716
2903
  d8[i] = bq8i->ds.x;
@@ -2720,7 +2907,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
2720
2907
  u[2*i+1] = q8[4];
2721
2908
  }
2722
2909
 
2723
- return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
2910
+ return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
2724
2911
 
2725
2912
  #else
2726
2913
 
@@ -2759,31 +2946,30 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
2759
2946
  return d * sumf_d;
2760
2947
 
2761
2948
  #else
2949
+ assert(false);
2762
2950
  return 0.0f; // only to satisfy the compiler
2763
2951
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2764
2952
 
2765
2953
  #endif
2766
2954
  }
2767
2955
 
2768
- static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2956
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2769
2957
 
2770
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2771
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K];
2772
- __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4];
2773
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
2958
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
2959
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
2960
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
2774
2961
 
2775
2962
  *x_ql = tile_x_ql;
2776
2963
  *x_dm = tile_x_dm;
2777
- *x_qh = tile_x_qh;
2778
2964
  *x_sc = tile_x_sc;
2779
2965
  }
2780
2966
 
2781
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
2967
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
2782
2968
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2783
2969
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2784
2970
 
2785
2971
  __builtin_assume(i_offset >= 0);
2786
- __builtin_assume(i_offset < 8);
2972
+ __builtin_assume(i_offset < nwarps);
2787
2973
  __builtin_assume(k >= 0);
2788
2974
  __builtin_assume(k < WARP_SIZE);
2789
2975
 
@@ -2793,7 +2979,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
2793
2979
  const block_q5_K * bx0 = (block_q5_K *) vx;
2794
2980
 
2795
2981
  #pragma unroll
2796
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
2982
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2797
2983
  int i = i0 + i_offset;
2798
2984
 
2799
2985
  if (need_check) {
@@ -2801,16 +2987,29 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
2801
2987
  }
2802
2988
 
2803
2989
  const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
2990
+ const int ky = QR5_K*kqsx;
2804
2991
 
2805
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
2992
+ const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
2993
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
2994
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
2995
+
2996
+ const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
2997
+ const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
2998
+ const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
2999
+
3000
+ const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
3001
+ const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
3002
+
3003
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
3004
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
2806
3005
  }
2807
3006
 
2808
3007
  const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
2809
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
3008
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2810
3009
 
2811
3010
  #pragma unroll
2812
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) {
2813
- int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
3011
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
3012
+ int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
2814
3013
 
2815
3014
  if (need_check) {
2816
3015
  i = min(i, i_max);
@@ -2822,107 +3021,37 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
2822
3021
  }
2823
3022
 
2824
3023
  #pragma unroll
2825
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) {
2826
- int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
2827
-
2828
- if (need_check) {
2829
- i = min(i, i_max);
2830
- }
2831
-
2832
- const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI5_K/4);
2833
-
2834
- x_qh[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->qh, k % (QI5_K/4));
2835
- }
2836
-
2837
- #pragma unroll
2838
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
2839
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
3024
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
3025
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
2840
3026
 
2841
3027
  if (need_check) {
2842
3028
  i = min(i, i_max);
2843
- }
2844
-
2845
- const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
2846
-
2847
- x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI5_K/8));
2848
- }
2849
- }
2850
-
2851
- static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
2852
- const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
2853
- const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
2854
-
2855
- __builtin_assume(i >= 0);
2856
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
2857
- __builtin_assume(j >= 0);
2858
- __builtin_assume(j < WARP_SIZE);
2859
- __builtin_assume(k >= 0);
2860
- __builtin_assume(k < WARP_SIZE);
2861
-
2862
- const int kbx = k / QI6_K; // == 0 if QK_K == 256
2863
- const int kqsx = k % QI6_K; // == k if QK_K == 256
2864
-
2865
- int vl[2];
2866
- int vh[2];
2867
- int u[2*QR4_K];
2868
- float d8[QR4_K];
2869
-
2870
- const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2));
2871
-
2872
- vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
2873
- vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
2874
-
2875
- vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset;
2876
- vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset;
2877
-
2878
- const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
2879
- uint16_t aux[2];
2880
- const int l = bq8_offset/2;
2881
- if (l < 2) {
2882
- aux[0] = scales[l+0] & 0x3f3f;
2883
- aux[1] = scales[l+2] & 0x3f3f;
2884
- } else {
2885
- aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
2886
- aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
2887
- }
2888
- const uint8_t * sc = (const uint8_t *)aux;
2889
- const uint8_t * m = sc + 2;
2890
-
2891
- for (int l = 0; l < QR5_K; ++l) {
2892
- const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
2893
- u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
2894
- u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
2895
- d8[l] = y_ds[kqsy / QI8_1].x;
2896
- }
2897
-
2898
- return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K + kbx], d8);
2899
- }
2900
-
2901
- #define VDR_q6_K_q8_1 1
2902
-
2903
- static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl(
2904
- const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
2905
- const float & d, const float * __restrict__ d8) {
2906
-
2907
- #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
2908
- float sumf = 0.0f;
3029
+ }
2909
3030
 
2910
- for (int i = 0; i < QR6_K; ++i) {
2911
- const int sc = scales[4*i];
3031
+ const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
2912
3032
 
2913
- const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
3033
+ const int * scales = (int *) bxi->scales;
2914
3034
 
2915
- const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
3035
+ const int ksc = k % (WARP_SIZE/8);
2916
3036
 
2917
- const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
3037
+ // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
3038
+ int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
3039
+ scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
2918
3040
 
2919
- sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
3041
+ x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
2920
3042
  }
3043
+ }
2921
3044
 
2922
- return d*sumf;
2923
- #else
2924
- return 0.0f; // only to satisfy the compiler
2925
- #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
3045
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
3046
+ const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3047
+ const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
3048
+
3049
+ const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
3050
+
3051
+ const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
3052
+ const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
3053
+ return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
3054
+ x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
2926
3055
  }
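
load_tiles_q5_K above now expands the 5-bit quants while filling the tile: the low nibbles come from two masked views of the qs word and the fifth bit is shifted from the qh word into bit 4 of each byte, so the shared-memory tile holds ready-to-use 8-bit values (hence the 2*WARP_SIZE-wide tile_x_ql and the dropped tile_x_qh). A host-side C++ sketch of that merge for one pair of words, with made-up inputs and the group shift fixed to 0:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t ql = 0x84738291u; // eight packed 4-bit quants (illustrative)
        const uint32_t qh = 0x01000F02u; // word carrying the matching fifth bits (illustrative)

        const uint32_t ql0 = (ql >> 0) & 0x0F0F0F0F; // low nibbles of the first four quants
        const uint32_t ql1 = (ql >> 4) & 0x0F0F0F0F; // low nibbles of the other four

        // fifth bits moved into bit 4 of each byte, mirroring the 0x10101010 masks above
        const uint32_t qh0 = ((qh >> 0) << 4) & 0x10101010;
        const uint32_t qh1 = ((qh >> 1) << 4) & 0x10101010;

        const uint32_t v0 = ql0 | qh0; // four complete 5-bit quants, one per byte
        const uint32_t v1 = ql1 | qh1;

        printf("expanded words: 0x%08X 0x%08X\n", (unsigned) v0, (unsigned) v1);
        return 0;
    }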
2927
3056
 
2928
3057
  static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -2942,33 +3071,32 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
2942
3071
  int u[QR6_K];
2943
3072
  float d8[QR6_K];
2944
3073
 
3074
+ #pragma unroll
2945
3075
  for (int i = 0; i < QR6_K; ++i) {
2946
3076
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
2947
3077
  d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
2948
3078
  }
2949
3079
 
2950
- return vec_dot_q6_K_q8_1_impl(vl, vh, u, scales, bq6_K->d, d8);
3080
+ return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
2951
3081
  }
2952
3082
 
2953
- static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
3083
+ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
2954
3084
 
2955
- __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y];
2956
- __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K];
2957
- __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2];
2958
- __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
3085
+ __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
3086
+ __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
3087
+ __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
2959
3088
 
2960
3089
  *x_ql = tile_x_ql;
2961
3090
  *x_dm = tile_x_dm;
2962
- *x_qh = tile_x_qh;
2963
3091
  *x_sc = tile_x_sc;
2964
3092
  }
2965
3093
 
2966
- template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
3094
+ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
2967
3095
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2968
3096
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2969
3097
 
2970
3098
  __builtin_assume(i_offset >= 0);
2971
- __builtin_assume(i_offset < 8);
3099
+ __builtin_assume(i_offset < nwarps);
2972
3100
  __builtin_assume(k >= 0);
2973
3101
  __builtin_assume(k < WARP_SIZE);
2974
3102
 
@@ -2978,7 +3106,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
2978
3106
  const block_q6_K * bx0 = (block_q6_K *) vx;
2979
3107
 
2980
3108
  #pragma unroll
2981
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) {
3109
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
2982
3110
  int i = i0 + i_offset;
2983
3111
 
2984
3112
  if (need_check) {
@@ -2986,42 +3114,43 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
2986
3114
  }
2987
3115
 
2988
3116
  const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
3117
+ const int ky = QR6_K*kqsx;
2989
3118
 
2990
- x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->ql, kqsx);
2991
- }
2992
-
2993
- const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
2994
- const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
2995
-
2996
- #pragma unroll
2997
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
2998
- int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
3119
+ const int ql = get_int_from_uint8(bxi->ql, kqsx);
3120
+ const int ql0 = (ql >> 0) & 0x0F0F0F0F;
3121
+ const int ql1 = (ql >> 4) & 0x0F0F0F0F;
2999
3122
 
3000
- if (need_check) {
3001
- i = min(i, i_max);
3002
- }
3123
+ const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
3124
+ const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
3125
+ const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
3003
3126
 
3004
- const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
3127
+ const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
3128
+ const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
3005
3129
 
3006
- x_dm[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd].x = bxi->d;
3130
+ x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
3131
+ x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
3007
3132
  }
3008
3133
 
3134
+ const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
3135
+ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
3136
+ float * x_dmf = (float *) x_dm;
3137
+
3009
3138
  #pragma unroll
3010
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) {
3011
- int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
3139
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
3140
+ int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
3012
3141
 
3013
3142
  if (need_check) {
3014
3143
  i = min(i, i_max);
3015
3144
  }
3016
3145
 
3017
- const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI6_K/2);
3146
+ const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
3018
3147
 
3019
- x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = get_int_from_uint8(bxi->qh, k % (QI6_K/2));
3148
+ x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
3020
3149
  }
3021
3150
 
3022
3151
  #pragma unroll
3023
- for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
3024
- int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
3152
+ for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
3153
+ int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
3025
3154
 
3026
3155
  if (need_check) {
3027
3156
  i = min(i, i_max);
@@ -3037,41 +3166,19 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
3037
3166
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
3038
3167
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
3039
3168
 
3040
- __builtin_assume(i >= 0);
3041
- __builtin_assume(i < GGML_CUDA_MMQ_Y);
3042
- __builtin_assume(j >= 0);
3043
- __builtin_assume(j < WARP_SIZE);
3044
- __builtin_assume(k >= 0);
3045
- __builtin_assume(k < WARP_SIZE);
3046
-
3047
- const int kbx = k / QI6_K; // == 0 if QK_K == 256
3048
- const int kqsx = k % QI6_K; // == k if QK_K == 256
3049
-
3050
- const int bq8_offset = 2 * QR6_K * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/4);
3051
- const int scale_offset = (QI6_K/4) * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/8);
3052
- const int vh_shift = 2 * ((kqsx % (QI6_K/2)) / (QI6_K/4));
3169
+ const float * x_dmf = (const float *) x_dm;
3170
+ const float * y_df = (const float *) y_ds;
3053
3171
 
3054
- const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI6_K/2) + (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)] >> vh_shift;
3172
+ const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
3055
3173
 
3056
- const int x_sc_offset = i * (WARP_SIZE/8) + i/8 + kbx * (QI6_K/8);
3057
- const int8_t * scales = ((int8_t *) (x_sc + x_sc_offset)) + scale_offset;
3058
-
3059
- int u[QR6_K];
3060
- float d8[QR6_K];
3061
-
3062
- for (int l = 0; l < QR6_K; ++l) {
3063
- const int kqsy = j * (QR6_K*WARP_SIZE) + kbx * (QR6_K*QI6_K) + (bq8_offset + 2*l)*QI8_1 + kqsx % QI8_1;
3064
- u[l] = y_qs[kqsy];
3065
- d8[l] = y_ds[kqsy / QI8_1].x;
3066
- }
3067
-
3068
- return vec_dot_q6_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales,
3069
- x_dm[i * (WARP_SIZE/QI6_K) + i/QI6_K + kbx].x, d8);
3174
+ const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
3175
+ const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
3176
+ return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
3070
3177
  }
3071
3178
 
3072
- template <int qk, int qr, int qi, typename block_q_t,
3179
+ template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
3073
3180
  allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
3074
- static __global__ void mul_mat_q(
3181
+ static __device__ __forceinline__ void mul_mat_q(
3075
3182
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3076
3183
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3077
3184
 
@@ -3084,14 +3191,10 @@ static __global__ void mul_mat_q(
3084
3191
 
3085
3192
  const int & ncols_dst = ncols_y;
3086
3193
 
3087
- const int tid_x = threadIdx.x;
3088
- const int tid_y = threadIdx.y;
3089
-
3090
- const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
3194
+ const int row_dst_0 = blockIdx.x*mmq_y;
3091
3195
  const int & row_x_0 = row_dst_0;
3092
- const int row_dst = row_dst_0 + tid_x;
3093
3196
 
3094
- const int col_dst_0 = blockIdx.y*WARP_SIZE;
3197
+ const int col_dst_0 = blockIdx.y*mmq_x;
3095
3198
  const int & col_y_0 = col_dst_0;
3096
3199
 
3097
3200
  int * tile_x_ql = nullptr;
@@ -3101,75 +3204,444 @@ static __global__ void mul_mat_q(
3101
3204
 
3102
3205
  allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
3103
3206
 
3104
- const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1;
3105
-
3106
- __shared__ int tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
3107
- __shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];
3207
+ __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
3208
+ __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
3108
3209
 
3109
- float sum[GGML_CUDA_MMQ_Y/WARP_SIZE][4] = {0.0f};
3210
+ float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
3110
3211
 
3111
3212
  for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
3112
3213
 
3113
3214
  load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
3114
- tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x);
3215
+ threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
3115
3216
 
3217
+ #pragma unroll
3116
3218
  for (int ir = 0; ir < qr; ++ir) {
3117
- const int kqs = ir*WARP_SIZE + tid_x;
3219
+ const int kqs = ir*WARP_SIZE + threadIdx.x;
3118
3220
  const int kbxd = kqs / QI8_1;
3119
3221
 
3120
- for (int i = 0; i < WARP_SIZE; i += 8) {
3121
- const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
3222
+ #pragma unroll
3223
+ for (int i = 0; i < mmq_x; i += nwarps) {
3224
+ const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
3122
3225
 
3123
3226
  const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
3124
3227
 
3125
- tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1);
3228
+ const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
3229
+ tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
3126
3230
  }
3127
- }
3128
3231
 
3129
- for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) {
3130
- const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE;
3131
- const int kby = tid_x % blocks_per_tile_y_col;
3132
- const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
3133
- tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby] = y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds;
3134
- }
3232
+ #pragma unroll
3233
+ for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
3234
+ const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
3235
+ const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
3236
+ const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
3237
+
3238
+ // if the sum is not needed it's faster to transform the scale to f32 ahead of time
3239
+ const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
3240
+ half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
3241
+ if (need_sum) {
3242
+ *dsi_dst = *dsi_src;
3243
+ } else {
3244
+ float * dfi_dst = (float *) dsi_dst;
3245
+ *dfi_dst = (*dsi_src).x;
3246
+ }
3247
+ }
3135
3248
 
3136
- __syncthreads();
3249
+ __syncthreads();
3137
3250
 
3138
- #if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
3139
- #pragma unroll
3140
- #endif // __CUDA_ARCH__ >= 700
3141
- for (int k = 0; k < WARP_SIZE; k += vdr) {
3251
+ // #pragma unroll // unrolling this loop causes too much register pressure
3252
+ for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
3142
3253
  #pragma unroll
3143
- for (int j = 0; j < WARP_SIZE; j += 8) {
3254
+ for (int j = 0; j < mmq_x; j += nwarps) {
3144
3255
  #pragma unroll
3145
- for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
3146
- sum[i/WARP_SIZE][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
3147
- tid_x + i, tid_y + j, k);
3256
+ for (int i = 0; i < mmq_y; i += WARP_SIZE) {
3257
+ sum[i/WARP_SIZE][j/nwarps] += vec_dot(
3258
+ tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
3259
+ threadIdx.x + i, threadIdx.y + j, k);
3260
+ }
3148
3261
  }
3149
3262
  }
3150
- }
3151
-
3152
- __syncthreads();
3153
- }
3154
3263
 
3155
-
3156
- if (row_dst >= nrows_dst) {
3157
- return;
3264
+ __syncthreads();
3265
+ }
3158
3266
  }
3159
3267
 
3160
- for (int j = 0; j < WARP_SIZE; j += 8) {
3161
- const int col_dst = col_dst_0 + j + tid_y;
3268
+ #pragma unroll
3269
+ for (int j = 0; j < mmq_x; j += nwarps) {
3270
+ const int col_dst = col_dst_0 + j + threadIdx.y;
3162
3271
 
3163
3272
  if (col_dst >= ncols_dst) {
3164
3273
  return;
3165
3274
  }
3166
3275
 
3167
- for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) {
3168
- dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/8];
3276
+ #pragma unroll
3277
+ for (int i = 0; i < mmq_y; i += WARP_SIZE) {
3278
+ const int row_dst = row_dst_0 + threadIdx.x + i;
3279
+
3280
+ if (row_dst >= nrows_dst) {
3281
+ continue;
3282
+ }
3283
+
3284
+ dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
3169
3285
  }
3170
3286
  }
3171
3287
  }
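
The rewritten mul_mat_q computes an mmq_y x mmq_x block of dst per thread block: each thread keeps mmq_y/WARP_SIZE x mmq_x/nwarps partial sums and, in the write-out loops above, scatters them to rows row_dst_0 + threadIdx.x + i and columns col_dst_0 + threadIdx.y + j. A host-side C++ sketch of that index mapping for one illustrative thread, using the Turing-sized q4_0 tile constants defined below:

    #include <cstdio>

    int main() {
        const int WARP_SIZE = 32;
        const int mmq_x = 64, mmq_y = 128, nwarps = 4;   // MMQ_X/Y_Q4_0_AMPERE, NWARPS_Q4_0_AMPERE

        const int blockIdx_x = 2, blockIdx_y = 1;        // illustrative block coordinates
        const int threadIdx_x = 5, threadIdx_y = 3;      // illustrative thread coordinates

        const int row_dst_0 = blockIdx_x * mmq_y;
        const int col_dst_0 = blockIdx_y * mmq_x;

        // same traversal order as the write-out loops in mul_mat_q
        for (int j = 0; j < mmq_x; j += nwarps) {
            for (int i = 0; i < mmq_y; i += WARP_SIZE) {
                const int row_dst = row_dst_0 + threadIdx_x + i;
                const int col_dst = col_dst_0 + threadIdx_y + j;
                printf("sum[%d][%2d] -> dst[row %3d, col %3d]\n",
                       i/WARP_SIZE, j/nwarps, row_dst, col_dst);
            }
        }
        return 0;
    }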
3172
3288
 
3289
+ #define MMQ_X_Q4_0_AMPERE 64
3290
+ #define MMQ_Y_Q4_0_AMPERE 128
3291
+ #define NWARPS_Q4_0_AMPERE 4
3292
+ #define MMQ_X_Q4_0_PASCAL 64
3293
+ #define MMQ_Y_Q4_0_PASCAL 64
3294
+ #define NWARPS_Q4_0_PASCAL 8
3295
+
3296
+ template <bool need_check> static __global__ void mul_mat_q4_0(
3297
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3298
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3299
+
3300
+ #if __CUDA_ARCH__ >= CC_TURING
3301
+ const int mmq_x = MMQ_X_Q4_0_AMPERE;
3302
+ const int mmq_y = MMQ_Y_Q4_0_AMPERE;
3303
+ const int nwarps = NWARPS_Q4_0_AMPERE;
3304
+
3305
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3306
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3307
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3308
+
3309
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3310
+ const int mmq_x = MMQ_X_Q4_0_PASCAL;
3311
+ const int mmq_y = MMQ_Y_Q4_0_PASCAL;
3312
+ const int nwarps = NWARPS_Q4_0_PASCAL;
3313
+
3314
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3315
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3316
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3317
+ #else
3318
+ (void) vec_dot_q4_0_q8_1_mul_mat;
3319
+ assert(false);
3320
+ #endif // __CUDA_ARCH__ >= CC_TURING
3321
+ }
3322
+
3323
+ #define MMQ_X_Q4_1_AMPERE 64
3324
+ #define MMQ_Y_Q4_1_AMPERE 128
3325
+ #define NWARPS_Q4_1_AMPERE 4
3326
+ #define MMQ_X_Q4_1_PASCAL 64
3327
+ #define MMQ_Y_Q4_1_PASCAL 64
3328
+ #define NWARPS_Q4_1_PASCAL 8
3329
+
3330
+ template <bool need_check> static __global__ void
3331
+ #if __CUDA_ARCH__ < CC_TURING
3332
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
3333
+ #endif // __CUDA_ARCH__ < CC_TURING
3334
+ mul_mat_q4_1(
3335
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3336
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3337
+
3338
+ #if __CUDA_ARCH__ >= CC_TURING
3339
+ const int mmq_x = MMQ_X_Q4_1_AMPERE;
3340
+ const int mmq_y = MMQ_Y_Q4_1_AMPERE;
3341
+ const int nwarps = NWARPS_Q4_1_AMPERE;
3342
+
3343
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
3344
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3345
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3346
+
3347
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3348
+ const int mmq_x = MMQ_X_Q4_1_PASCAL;
3349
+ const int mmq_y = MMQ_Y_Q4_1_PASCAL;
3350
+ const int nwarps = NWARPS_Q4_1_PASCAL;
3351
+
3352
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
3353
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3354
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3355
+ #else
3356
+ (void) vec_dot_q4_1_q8_1_mul_mat;
3357
+ assert(false);
3358
+ #endif // __CUDA_ARCH__ >= CC_TURING
3359
+ }
3360
+
3361
+ #define MMQ_X_Q5_0_AMPERE 128
3362
+ #define MMQ_Y_Q5_0_AMPERE 64
3363
+ #define NWARPS_Q5_0_AMPERE 4
3364
+ #define MMQ_X_Q5_0_PASCAL 64
3365
+ #define MMQ_Y_Q5_0_PASCAL 64
3366
+ #define NWARPS_Q5_0_PASCAL 8
3367
+
3368
+ template <bool need_check> static __global__ void mul_mat_q5_0(
3369
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3370
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3371
+
3372
+ #if __CUDA_ARCH__ >= CC_TURING
3373
+ const int mmq_x = MMQ_X_Q5_0_AMPERE;
3374
+ const int mmq_y = MMQ_Y_Q5_0_AMPERE;
3375
+ const int nwarps = NWARPS_Q5_0_AMPERE;
3376
+
3377
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
3378
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3379
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3380
+
3381
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3382
+ const int mmq_x = MMQ_X_Q5_0_PASCAL;
3383
+ const int mmq_y = MMQ_Y_Q5_0_PASCAL;
3384
+ const int nwarps = NWARPS_Q5_0_PASCAL;
3385
+
3386
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
3387
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3388
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3389
+ #else
3390
+ (void) vec_dot_q5_0_q8_1_mul_mat;
3391
+ assert(false);
3392
+ #endif // __CUDA_ARCH__ >= CC_TURING
3393
+ }
3394
+
3395
+ #define MMQ_X_Q5_1_AMPERE 128
3396
+ #define MMQ_Y_Q5_1_AMPERE 64
3397
+ #define NWARPS_Q5_1_AMPERE 4
3398
+ #define MMQ_X_Q5_1_PASCAL 64
3399
+ #define MMQ_Y_Q5_1_PASCAL 64
3400
+ #define NWARPS_Q5_1_PASCAL 8
3401
+
3402
+ template <bool need_check> static __global__ void mul_mat_q5_1(
3403
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3404
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3405
+
3406
+ #if __CUDA_ARCH__ >= CC_TURING
3407
+ const int mmq_x = MMQ_X_Q5_1_AMPERE;
3408
+ const int mmq_y = MMQ_Y_Q5_1_AMPERE;
3409
+ const int nwarps = NWARPS_Q5_1_AMPERE;
3410
+
3411
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
3412
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3413
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3414
+
3415
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3416
+ const int mmq_x = MMQ_X_Q5_1_PASCAL;
3417
+ const int mmq_y = MMQ_Y_Q5_1_PASCAL;
3418
+ const int nwarps = NWARPS_Q5_1_PASCAL;
3419
+
3420
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
3421
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3422
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3423
+ #else
3424
+ (void) vec_dot_q5_1_q8_1_mul_mat;
3425
+ assert(false);
3426
+ #endif // __CUDA_ARCH__ >= CC_TURING
3427
+ }
3428
+
3429
+ #define MMQ_X_Q8_0_AMPERE 128
3430
+ #define MMQ_Y_Q8_0_AMPERE 64
3431
+ #define NWARPS_Q8_0_AMPERE 4
3432
+ #define MMQ_X_Q8_0_PASCAL 64
3433
+ #define MMQ_Y_Q8_0_PASCAL 64
3434
+ #define NWARPS_Q8_0_PASCAL 8
3435
+
3436
+ template <bool need_check> static __global__ void mul_mat_q8_0(
3437
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3438
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3439
+
3440
+ #if __CUDA_ARCH__ >= CC_TURING
3441
+ const int mmq_x = MMQ_X_Q8_0_AMPERE;
3442
+ const int mmq_y = MMQ_Y_Q8_0_AMPERE;
3443
+ const int nwarps = NWARPS_Q8_0_AMPERE;
3444
+
3445
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
3446
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3447
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3448
+
3449
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3450
+ const int mmq_x = MMQ_X_Q8_0_PASCAL;
3451
+ const int mmq_y = MMQ_Y_Q8_0_PASCAL;
3452
+ const int nwarps = NWARPS_Q8_0_PASCAL;
3453
+
3454
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
3455
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3456
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3457
+ #else
3458
+ (void) vec_dot_q8_0_q8_1_mul_mat;
3459
+ assert(false);
3460
+ #endif // __CUDA_ARCH__ >= CC_TURING
3461
+ }
3462
+
3463
+ #define MMQ_X_Q2_K_AMPERE 64
3464
+ #define MMQ_Y_Q2_K_AMPERE 128
3465
+ #define NWARPS_Q2_K_AMPERE 4
3466
+ #define MMQ_X_Q2_K_PASCAL 64
3467
+ #define MMQ_Y_Q2_K_PASCAL 64
3468
+ #define NWARPS_Q2_K_PASCAL 8
3469
+
3470
+ template <bool need_check> static __global__ void mul_mat_q2_K(
3471
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3472
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3473
+
3474
+ #if __CUDA_ARCH__ >= CC_TURING
3475
+ const int mmq_x = MMQ_X_Q2_K_AMPERE;
3476
+ const int mmq_y = MMQ_Y_Q2_K_AMPERE;
3477
+ const int nwarps = NWARPS_Q2_K_AMPERE;
3478
+
3479
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
3480
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3481
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3482
+
3483
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3484
+ const int mmq_x = MMQ_X_Q2_K_PASCAL;
3485
+ const int mmq_y = MMQ_Y_Q2_K_PASCAL;
3486
+ const int nwarps = NWARPS_Q2_K_PASCAL;
3487
+
3488
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
3489
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3490
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3491
+ #else
3492
+ (void) vec_dot_q2_K_q8_1_mul_mat;
3493
+ assert(false);
3494
+ #endif // __CUDA_ARCH__ >= CC_TURING
3495
+ }
3496
+
3497
+ #define MMQ_X_Q3_K_AMPERE 128
3498
+ #define MMQ_Y_Q3_K_AMPERE 128
3499
+ #define NWARPS_Q3_K_AMPERE 4
3500
+ #define MMQ_X_Q3_K_PASCAL 64
3501
+ #define MMQ_Y_Q3_K_PASCAL 64
3502
+ #define NWARPS_Q3_K_PASCAL 8
3503
+
3504
+ template <bool need_check> static __global__ void
3505
+ #if __CUDA_ARCH__ < CC_TURING
3506
+ __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
3507
+ #endif // __CUDA_ARCH__ < CC_TURING
3508
+ mul_mat_q3_K(
3509
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3510
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3511
+
3512
+ #if __CUDA_ARCH__ >= CC_TURING
3513
+ const int mmq_x = MMQ_X_Q3_K_AMPERE;
3514
+ const int mmq_y = MMQ_Y_Q3_K_AMPERE;
3515
+ const int nwarps = NWARPS_Q3_K_AMPERE;
3516
+
3517
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
3518
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3519
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3520
+
3521
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3522
+ const int mmq_x = MMQ_X_Q3_K_PASCAL;
3523
+ const int mmq_y = MMQ_Y_Q3_K_PASCAL;
3524
+ const int nwarps = NWARPS_Q3_K_PASCAL;
3525
+
3526
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
3527
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3528
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3529
+ #else
3530
+ (void) vec_dot_q3_K_q8_1_mul_mat;
3531
+ assert(false);
3532
+ #endif // __CUDA_ARCH__ >= CC_TURING
3533
+ }
3534
+
3535
+ #define MMQ_X_Q4_K_AMPERE 64
3536
+ #define MMQ_Y_Q4_K_AMPERE 128
3537
+ #define NWARPS_Q4_K_AMPERE 4
3538
+ #define MMQ_X_Q4_K_PASCAL 64
3539
+ #define MMQ_Y_Q4_K_PASCAL 64
3540
+ #define NWARPS_Q4_K_PASCAL 8
3541
+
3542
+ template <bool need_check> static __global__ void
3543
+ #if __CUDA_ARCH__ < CC_TURING
3544
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
3545
+ #endif // __CUDA_ARCH__ < CC_TURING
3546
+ mul_mat_q4_K(
3547
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3548
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3549
+
3550
+ #if __CUDA_ARCH__ >= CC_TURING
3551
+ const int mmq_x = MMQ_X_Q4_K_AMPERE;
3552
+ const int mmq_y = MMQ_Y_Q4_K_AMPERE;
3553
+ const int nwarps = NWARPS_Q4_K_AMPERE;
3554
+
3555
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
3556
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3557
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3558
+
3559
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3560
+ const int mmq_x = MMQ_X_Q4_K_PASCAL;
3561
+ const int mmq_y = MMQ_Y_Q4_K_PASCAL;
3562
+ const int nwarps = NWARPS_Q4_K_PASCAL;
3563
+
3564
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
3565
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3566
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3567
+ #else
3568
+ (void) vec_dot_q4_K_q8_1_mul_mat;
3569
+ assert(false);
3570
+ #endif // __CUDA_ARCH__ >= CC_TURING
3571
+ }
3572
+
3573
+ #define MMQ_X_Q5_K_AMPERE 64
3574
+ #define MMQ_Y_Q5_K_AMPERE 128
3575
+ #define NWARPS_Q5_K_AMPERE 4
3576
+ #define MMQ_X_Q5_K_PASCAL 64
3577
+ #define MMQ_Y_Q5_K_PASCAL 64
3578
+ #define NWARPS_Q5_K_PASCAL 8
3579
+
3580
+ template <bool need_check> static __global__ void mul_mat_q5_K(
3581
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3582
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3583
+
3584
+ #if __CUDA_ARCH__ >= CC_TURING
3585
+ const int mmq_x = MMQ_X_Q5_K_AMPERE;
3586
+ const int mmq_y = MMQ_Y_Q5_K_AMPERE;
3587
+ const int nwarps = NWARPS_Q5_K_AMPERE;
3588
+
3589
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
3590
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
3591
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3592
+
3593
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3594
+ const int mmq_x = MMQ_X_Q5_K_PASCAL;
3595
+ const int mmq_y = MMQ_Y_Q5_K_PASCAL;
3596
+ const int nwarps = NWARPS_Q5_K_PASCAL;
3597
+
3598
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
3599
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
3600
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3601
+ #else
3602
+ (void) vec_dot_q5_K_q8_1_mul_mat;
3603
+ assert(false);
3604
+ #endif // __CUDA_ARCH__ >= CC_TURING
3605
+ }
3606
+
3607
+ #define MMQ_X_Q6_K_AMPERE 64
3608
+ #define MMQ_Y_Q6_K_AMPERE 64
3609
+ #define NWARPS_Q6_K_AMPERE 4
3610
+ #define MMQ_X_Q6_K_PASCAL 64
3611
+ #define MMQ_Y_Q6_K_PASCAL 64
3612
+ #define NWARPS_Q6_K_PASCAL 8
3613
+
3614
+ template <bool need_check> static __global__ void
3615
+ #if __CUDA_ARCH__ < CC_TURING
3616
+ __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
3617
+ #endif // __CUDA_ARCH__ < CC_TURING
3618
+ mul_mat_q6_K(
3619
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3620
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3621
+
3622
+ #if __CUDA_ARCH__ >= CC_TURING
3623
+ const int mmq_x = MMQ_X_Q6_K_AMPERE;
3624
+ const int mmq_y = MMQ_Y_Q6_K_AMPERE;
3625
+ const int nwarps = NWARPS_Q6_K_AMPERE;
3626
+
3627
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
3628
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
3629
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3630
+
3631
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3632
+ const int mmq_x = MMQ_X_Q6_K_PASCAL;
3633
+ const int mmq_y = MMQ_Y_Q6_K_PASCAL;
3634
+ const int nwarps = NWARPS_Q6_K_PASCAL;
3635
+
3636
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
3637
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
3638
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3639
+ #else
3640
+ (void) vec_dot_q6_K_q8_1_mul_mat;
3641
+ assert(false);
3642
+ #endif // __CUDA_ARCH__ >= CC_TURING
3643
+ }
3644
+
3173
3645
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
3174
3646
  static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
3175
3647
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -3780,7 +4252,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
3780
4252
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3781
4253
  const dim3 block_nums(1, block_num_y, 1);
3782
4254
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3783
- mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1>
4255
+ mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
3784
4256
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3785
4257
  }
3786
4258
 
@@ -3789,7 +4261,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
3789
4261
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3790
4262
  const dim3 block_nums(1, block_num_y, 1);
3791
4263
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3792
- mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1>
4264
+ mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
3793
4265
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3794
4266
  }
3795
4267
 
@@ -3798,7 +4270,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
3798
4270
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3799
4271
  const dim3 block_nums(1, block_num_y, 1);
3800
4272
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3801
- mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1>
4273
+ mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
3802
4274
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3803
4275
  }
3804
4276
 
@@ -3807,7 +4279,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
3807
4279
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3808
4280
  const dim3 block_nums(1, block_num_y, 1);
3809
4281
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3810
- mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1>
4282
+ mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
3811
4283
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3812
4284
  }
3813
4285
 
@@ -3816,7 +4288,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
3816
4288
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
3817
4289
  const dim3 block_nums(1, block_num_y, 1);
3818
4290
  const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
3819
- mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1>
4291
+ mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
3820
4292
  <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
3821
4293
  }
3822
4294
 
@@ -3867,17 +4339,36 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q4_0_AMPERE;
+ mmq_y = MMQ_Y_Q4_0_AMPERE;
+ nwarps = NWARPS_Q4_0_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q4_0_PASCAL;
+ mmq_y = MMQ_Y_Q4_0_PASCAL;
+ nwarps = NWARPS_Q4_0_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
  const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<false>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  } else {
- mul_mat_q<QK4_0, QR4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0<true>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
  }
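
Editor's note: every ggml_mul_mat_*_q8_1_cuda launcher in this release follows the same shape as the hunk above: query the current device's compute capability, pick tile sizes (mmq_x, mmq_y) and a warp count tuned either for Turing-and-newer or for Pascal-class DP4A cards, and then launch one of two template instantiations of the kernel depending on whether the row count divides evenly by the tile height (need_check). The sketch below is a minimal, hypothetical illustration of that dispatch pattern, not code from the library; example_kernel, example_launch and the tile values are made up for the example.

    // Hypothetical sketch of the capability-based tiling + need_check dispatch.
    #include <cuda_runtime.h>

    #define SKETCH_CC_TURING   700 // same thresholds as CC_TURING / MIN_CC_DP4A above
    #define SKETCH_MIN_CC_DP4A 610

    template <bool need_check>
    static __global__ void example_kernel(const float * x, float * dst, const int nrows) {
        const int row = blockIdx.x*blockDim.x + threadIdx.x;
        if (need_check && row >= nrows) {
            return; // bounds check is only compiled into the "ragged" variant
        }
        dst[row] = 2.0f*x[row];
    }

    static void example_launch(const float * x, float * dst, const int nrows,
                               const int compute_capability, cudaStream_t stream) {
        // pick a tile height for the detected architecture (values are illustrative)
        int tile_y = 0;
        if (compute_capability >= SKETCH_CC_TURING) {
            tile_y = 128;
        } else if (compute_capability >= SKETCH_MIN_CC_DP4A) {
            tile_y = 64;
        } else {
            return; // the real launchers assert here instead
        }

        const int block_num = (nrows + tile_y - 1) / tile_y;
        if (nrows % tile_y == 0) {
            example_kernel<false><<<block_num, tile_y, 0, stream>>>(x, dst, nrows);
        } else {
            example_kernel<true><<<block_num, tile_y, 0, stream>>>(x, dst, nrows);
        }
    }

Instantiating the kernel on a compile-time bool lets the evenly divisible case skip the per-row bounds check entirely, while the ragged case keeps it; the same trade-off drives the need_check split in the hunks that follow.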
 
@@ -3885,17 +4376,36 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q4_1_AMPERE;
+ mmq_y = MMQ_Y_Q4_1_AMPERE;
+ nwarps = NWARPS_Q4_1_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q4_1_PASCAL;
+ mmq_y = MMQ_Y_Q4_1_PASCAL;
+ nwarps = NWARPS_Q4_1_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
  const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<false>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  } else {
- mul_mat_q<QK4_1, QR4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1<true>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
  }
 
@@ -3903,17 +4413,36 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q5_0_AMPERE;
+ mmq_y = MMQ_Y_Q5_0_AMPERE;
+ nwarps = NWARPS_Q5_0_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q5_0_PASCAL;
+ mmq_y = MMQ_Y_Q5_0_PASCAL;
+ nwarps = NWARPS_Q5_0_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
  const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<false>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  } else {
- mul_mat_q<QK5_0, QR5_0, QI5_0, block_q5_0, allocate_tiles_q5_0, load_tiles_q5_0<true>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
  }
 
@@ -3921,17 +4450,36 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q5_1_AMPERE;
+ mmq_y = MMQ_Y_Q5_1_AMPERE;
+ nwarps = NWARPS_Q5_1_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q5_1_PASCAL;
+ mmq_y = MMQ_Y_Q5_1_PASCAL;
+ nwarps = NWARPS_Q5_1_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
  const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<false>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  } else {
- mul_mat_q<QK5_1, QR5_1, QI5_1, block_q5_1, allocate_tiles_q5_1, load_tiles_q5_1<true>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
  }
 
@@ -3939,17 +4487,36 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q8_0_AMPERE;
+ mmq_y = MMQ_Y_Q8_0_AMPERE;
+ nwarps = NWARPS_Q8_0_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q8_0_PASCAL;
+ mmq_y = MMQ_Y_Q8_0_PASCAL;
+ nwarps = NWARPS_Q8_0_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
  const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<false>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  } else {
- mul_mat_q<QK8_0, QR8_0, QI8_0, block_q8_0, allocate_tiles_q8_0, load_tiles_q8_0<true>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
  }
 
@@ -3957,17 +4524,36 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q2_K_AMPERE;
+ mmq_y = MMQ_Y_Q2_K_AMPERE;
+ nwarps = NWARPS_Q2_K_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q2_K_PASCAL;
+ mmq_y = MMQ_Y_Q2_K_PASCAL;
+ nwarps = NWARPS_Q2_K_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
  const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<false>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  } else {
- mul_mat_q<QK_K, QR2_K, QI2_K, block_q2_K, allocate_tiles_q2_K, load_tiles_q2_K<true>, VDR_q2_K_q8_1, vec_dot_q2_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
  }
 
@@ -3975,17 +4561,36 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q3_K_AMPERE;
+ mmq_y = MMQ_Y_Q3_K_AMPERE;
+ nwarps = NWARPS_Q3_K_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q3_K_PASCAL;
+ mmq_y = MMQ_Y_Q3_K_PASCAL;
+ nwarps = NWARPS_Q3_K_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
  const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<false>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  } else {
- mul_mat_q<QK_K, QR3_K, QI3_K, block_q3_K, allocate_tiles_q3_K, load_tiles_q3_K<true>, VDR_q3_K_q8_1, vec_dot_q3_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
  }
 
@@ -3993,17 +4598,36 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q4_K_AMPERE;
+ mmq_y = MMQ_Y_Q4_K_AMPERE;
+ nwarps = NWARPS_Q4_K_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q4_K_PASCAL;
+ mmq_y = MMQ_Y_Q4_K_PASCAL;
+ nwarps = NWARPS_Q4_K_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
  const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<false>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  } else {
- mul_mat_q<QK_K, QR4_K, QI4_K, block_q4_K, allocate_tiles_q4_K, load_tiles_q4_K<true>, VDR_q4_K_q8_1, vec_dot_q4_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
  }
 
@@ -4011,17 +4635,36 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q5_K_AMPERE;
+ mmq_y = MMQ_Y_Q5_K_AMPERE;
+ nwarps = NWARPS_Q5_K_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q5_K_PASCAL;
+ mmq_y = MMQ_Y_Q5_K_PASCAL;
+ nwarps = NWARPS_Q5_K_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
  const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<false>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  } else {
- mul_mat_q<QK_K, QR5_K, QI5_K, block_q5_K, allocate_tiles_q5_K, load_tiles_q5_K<true>, VDR_q5_K_q8_1, vec_dot_q5_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
  }
 
@@ -4029,17 +4672,36 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
- const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
- const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+ const int compute_capability = g_compute_capabilities[id];
+
+ int mmq_x, mmq_y, nwarps;
+ if (compute_capability >= CC_TURING) {
+ mmq_x = MMQ_X_Q6_K_AMPERE;
+ mmq_y = MMQ_Y_Q6_K_AMPERE;
+ nwarps = NWARPS_Q6_K_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q6_K_PASCAL;
+ mmq_y = MMQ_Y_Q6_K_PASCAL;
+ nwarps = NWARPS_Q6_K_PASCAL;
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
  const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
- if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
- mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<false>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  } else {
- mul_mat_q<QK_K, QR6_K, QI6_K, block_q6_K, allocate_tiles_q6_K, load_tiles_q6_K<true>, VDR_q6_K_q8_1, vec_dot_q6_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ const bool need_check = true;
+ mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
  }
 
@@ -4214,20 +4876,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
  }
 
 
- static void * g_scratch_buffer = nullptr;
- static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
- static size_t g_scratch_offset = 0;
-
- static int g_device_count = -1;
- static int g_main_device = 0;
- static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
- static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
- static bool g_mul_mat_q = false;
-
- static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
-
- static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
  void ggml_init_cublas() {
  static bool initialized = false;
 
@@ -4583,6 +5231,37 @@ inline void ggml_cuda_op_mul_mat_q(
  (void) i1;
  }
 
+ static int64_t get_row_rounding(ggml_type type) {
+ int max_compute_capability = INT_MIN;
+ for (int id = 0; id < g_device_count; ++id) {
+ if (max_compute_capability < g_compute_capabilities[id]
+ && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+ max_compute_capability = g_compute_capabilities[id];
+ }
+ }
+
+ switch(type) {
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1:
+ return max_compute_capability >= CC_TURING ? 128 : 64;
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
+ return 64;
+ case GGML_TYPE_F16:
+ return 1;
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ return max_compute_capability >= CC_TURING ? 128 : 64;
+ case GGML_TYPE_Q6_K:
+ return 64;
+ default:
+ GGML_ASSERT(false);
+ }
+ }
+
  inline void ggml_cuda_op_mul_mat_vec(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -4983,14 +5662,16 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 
  int64_t row_low, row_high;
  if (split) {
+ const int64_t rounding = get_row_rounding(src0->type);
+
  row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
- row_low -= row_low % GGML_CUDA_MMQ_Y;
+ row_low -= row_low % rounding;
 
  if (id == g_device_count - 1) {
  row_high = nrows0;
  } else {
  row_high = nrows0*g_tensor_split[id + 1];
- row_high -= row_high % GGML_CUDA_MMQ_Y;
+ row_high -= row_high % rounding;
  }
  } else {
  row_low = 0;
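
Editor's note: the per-type value from get_row_rounding() replaces the old fixed GGML_CUDA_MMQ_Y alignment when slicing rows across GPUs. As a worked example, assume a hypothetical tensor with 4544 rows, a 50/50 tensor split and a rounding of 128 (a K-quant on a Turing-or-newer card): the raw boundary 4544*0.5 = 2272 is rounded down to 2176, so device 0 gets rows [0, 2176) and device 1 gets rows [2176, 4544). A standalone host-side sketch of that arithmetic follows; the row count, split and rounding are illustrative, not values from the library.

    // Sketch of the boundary rounding applied in the hunk above (illustrative values).
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t nrows    = 4544;                    // hypothetical row count
        const float   split[3] = {0.0f, 0.5f, 1.0f};      // 50/50 split over 2 devices
        const int64_t rounding = 128;                     // e.g. a K-quant on >= Turing

        for (int id = 0; id < 2; ++id) {
            int64_t row_low  = id == 0 ? 0 : (int64_t)(nrows*split[id]);
            row_low         -= row_low % rounding;        // align the lower boundary

            int64_t row_high = id == 1 ? nrows : (int64_t)(nrows*split[id + 1]);
            if (id != 1) {
                row_high -= row_high % rounding;          // align all but the last boundary
            }
            std::printf("device %d: rows [%lld, %lld)\n", id, (long long) row_low, (long long) row_high);
        }
        return 0;
    }

Running the sketch prints "device 0: rows [0, 2176)" and "device 1: rows [2176, 4544)", matching the arithmetic described above.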
@@ -5203,7 +5884,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  if (split && g_device_count > 1) {
  CUDA_CHECK(cudaSetDevice(g_main_device));
  for (int id = 0; id < g_device_count; ++id) {
- if (id != g_main_device) {
+ if (id != g_main_device && src0_extra->events[id]) {
  CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
  }
  }
@@ -5347,7 +6028,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
  } else {
  int min_compute_capability = INT_MAX;
  for (int id = 0; id < g_device_count; ++id) {
- if (min_compute_capability > g_compute_capabilities[id]) {
+ if (min_compute_capability > g_compute_capabilities[id]
+ && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
  min_compute_capability = g_compute_capabilities[id];
  }
  }
@@ -5468,14 +6150,16 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  row_low = 0;
  row_high = nrows;
  } else if (backend == GGML_BACKEND_GPU_SPLIT) {
+ const int64_t rounding = get_row_rounding(tensor->type);
+
  row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
- row_low -= row_low % GGML_CUDA_MMQ_Y;
+ row_low -= row_low % rounding;
 
  if (id == g_device_count - 1) {
  row_high = nrows;
  } else {
  row_high = nrows*g_tensor_split[id + 1];
- row_high -= row_high % GGML_CUDA_MMQ_Y;
+ row_high -= row_high % rounding;
  }
  } else {
  GGML_ASSERT(false);
@@ -5785,3 +6469,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  func(tensor->src[0], tensor->src[1], tensor);
  return true;
  }
+
+ int ggml_cuda_get_device_count() {
+ int device_count;
+ CUDA_CHECK(cudaGetDeviceCount(&device_count));
+ return device_count;
+ }
+
+ void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+ cudaDeviceProp prop;
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+ snprintf(description, description_size, "%s", prop.name);
+ }
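
Editor's note: these two exported helpers give a binding a way to enumerate GPUs without calling the CUDA runtime directly. A minimal host-side usage sketch, assuming their declarations are visible to the caller (the ggml-cuda.h header is an assumption here):

    // Sketch: enumerate CUDA devices through the helpers added above.
    #include <cstdio>
    #include "ggml-cuda.h" // assumed to declare the two helpers

    int main() {
        const int n_devices = ggml_cuda_get_device_count();
        for (int i = 0; i < n_devices; ++i) {
            char desc[256];
            ggml_cuda_get_device_description(i, desc, sizeof(desc));
            std::printf("device %d: %s\n", i, desc);
        }
        return 0;
    }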