llama_cpp 0.12.6 → 0.12.7

@@ -1,3 +1,7 @@
+ #include "ggml-cuda.h"
+ #include "ggml.h"
+ #include "ggml-backend-impl.h"
+
  #include <algorithm>
  #include <assert.h>
  #include <atomic>
@@ -54,6 +58,8 @@
  #define cudaDeviceProp hipDeviceProp_t
  #define cudaDeviceSynchronize hipDeviceSynchronize
  #define cudaError_t hipError_t
+ #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
+ #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
  #define cudaEventCreateWithFlags hipEventCreateWithFlags
  #define cudaEventDisableTiming hipEventDisableTiming
  #define cudaEventRecord hipEventRecord
@@ -119,11 +125,6 @@

  #endif // defined(GGML_USE_HIPBLAS)

- // ggml-cuda need half type so keep ggml headers include at last
- #include "ggml-cuda.h"
- #include "ggml.h"
- #include "ggml-backend-impl.h"
-
  #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)

  #define CC_PASCAL 600
@@ -150,8 +151,8 @@
  #define CUDA_USE_TENSOR_CORES
  #endif

- // max batch size to use MMQ kernels when tensor cores are available
- #define MMQ_MAX_BATCH_SIZE 32
+ #define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
+ #define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available

  #if defined(GGML_USE_HIPBLAS)
  #define __CUDA_ARCH__ 1300
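Note: the two thresholds above drive the kernel choice made later in this diff (in ggml_cuda_mul_mat), where src1->ne[1] is the batch size. A simplified sketch of that decision, with hypothetical helper names and assuming the conditions shown further down (not part of the upstream change):

```cpp
// Simplified sketch of how MMVQ_MAX_BATCH_SIZE and MMQ_MAX_BATCH_SIZE are used below.
static bool prefer_mmvq(bool src0_quantized, bool dp4a_available, int batch_size) {
    // mul_mat_vec_q handles small batches of quantized x f32 products
    return src0_quantized && dp4a_available && batch_size <= 8;   // MMVQ_MAX_BATCH_SIZE
}

static bool prefer_mmq(bool mmq_supported, bool tensor_cores, bool fp16_good, int batch_size) {
    // with tensor cores and good fp16 performance, mul_mat_q is only used up to
    // MMQ_MAX_BATCH_SIZE columns; larger batches fall back to the cuBLAS path
    return mmq_supported && (!tensor_cores || !fp16_good || batch_size <= 32); // MMQ_MAX_BATCH_SIZE
}
```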
@@ -517,6 +518,24 @@ typedef struct {
  } block_iq3_xxs;
  static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");

+ #define QR1_S 8
+ #define QI1_S (QK_K / (4*QR1_S))
+ typedef struct {
+ half d;
+ uint8_t qs[QK_K/8];
+ uint8_t scales[QK_K/16];
+ } block_iq1_s;
+ static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
+
+ #define QK4_NL 32
+ #define QR4_NL 2
+ #define QI4_NL (QK4_NL / (4*QR4_NL))
+ typedef struct {
+ half d;
+ uint8_t qs[QK4_NL/2];
+ } block_iq4_nl;
+ static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
+
  #define WARP_SIZE 32
  #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
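The two new block layouts above pin down the storage cost of the IQ1_S and IQ4_NL quantization types. A quick arithmetic check of the implied bits per weight, assuming QK_K == 256 as in the static_asserts (standalone, not part of the diff):

```cpp
#include <cstdio>

int main() {
    const int QK_K   = 256;
    const int QK4_NL = 32;
    const int iq1_s_bytes  = 2 + QK_K/8 + QK_K/16; // half d + qs + scales = 50 bytes per 256 weights
    const int iq4_nl_bytes = 2 + QK4_NL/2;         // half d + 4-bit qs    = 18 bytes per 32 weights
    printf("iq1_s : %.4f bits per weight\n", 8.0 * iq1_s_bytes  / QK_K);   // 1.5625
    printf("iq4_nl: %.4f bits per weight\n", 8.0 * iq4_nl_bytes / QK4_NL); // 4.5000
    return 0;
}
```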
 
@@ -642,18 +661,18 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
  return a;
  }

- static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
- #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
- }
- return a;
- #else
- (void) a;
- NO_DEVICE_CODE;
- #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
- }
+ //static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+ //#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+ //#pragma unroll
+ // for (int mask = 16; mask > 0; mask >>= 1) {
+ // a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+ // }
+ // return a;
+ //#else
+ // (void) a;
+ // NO_DEVICE_CODE;
+ //#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+ //}

  static __device__ __forceinline__ float warp_reduce_max(float x) {
  #pragma unroll
@@ -663,18 +682,18 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
  return x;
  }

- static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
- #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
- }
- return x;
- #else
- (void) x;
- NO_DEVICE_CODE;
- #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
- }
+ //static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
+ //#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
+ //#pragma unroll
+ // for (int mask = 16; mask > 0; mask >>= 1) {
+ // x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+ // }
+ // return x;
+ //#else
+ // (void) x;
+ // NO_DEVICE_CODE;
+ //#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
+ //}

  static __device__ __forceinline__ float op_repeat(const float a, const float b) {
  return b;
@@ -1681,6 +1700,137 @@ static const __device__ uint32_t iq3xxs_grid[256] = {
  0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
  };

+ static const __device__ uint64_t iq1s_grid[512] = {
+ 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
+ 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
+ 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
+ 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
+ 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
+ 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
+ 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
+ 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
+ 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
+ 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
+ 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
+ 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
+ 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
+ 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
+ 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
+ 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
+ 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
+ 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
+ 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
+ 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
+ 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
+ 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
+ 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
+ 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
+ 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
+ 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
+ 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
+ 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
+ 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
+ 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
+ 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
+ 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
+ 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
+ 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
+ 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
+ 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
+ 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
+ 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
+ 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
+ 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
+ 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
+ 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
+ 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
+ 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
+ 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
+ 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
+ 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
+ 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
+ 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
+ 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
+ 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
+ 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
+ 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
+ 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
+ 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
+ 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
+ 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
+ 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
+ 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
+ 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
+ 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
+ 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
+ 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
+ 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
+ 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
+ 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
+ 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
+ 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
+ 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
+ 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
+ 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
+ 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
+ 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
+ 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
+ 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
+ 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
+ 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
+ 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
+ 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
+ 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
+ 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
+ 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
+ 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
+ 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
+ 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
+ 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
+ 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
+ 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
+ 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
+ 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
+ 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
+ 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
+ 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
+ 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
+ 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
+ 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
+ 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
+ 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
+ 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
+ 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
+ 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
+ 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
+ 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
+ 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
+ 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
+ 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
+ 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
+ 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
+ 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
+ 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
+ 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
+ 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
+ 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
+ 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
+ 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
+ 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
+ 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
+ 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
+ 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
+ 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
+ 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
+ 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
+ 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
+ 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
+ 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
+ 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
+ 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
+ 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
+ };
+
  static const __device__ uint8_t ksigns_iq2xs[128] = {
  0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
  144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
@@ -1823,6 +1973,49 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds

  }

+ template<typename dst_t>
+ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+ const int i = blockIdx.x;
+ const block_iq1_s * x = (const block_iq1_s *) vx;
+
+ const int tid = threadIdx.x;
+ #if QK_K == 256
+ const int il = tid/8; // 0...3
+ const int ib = tid%8; // 0...7
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+ const int i8 = 4*ib+il;
+ uint8_t h = x[i].scales[i8/2] >> 4*(i8%2);
+ const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5)));
+ const float d = (float)x[i].d * (2*(h & 7) + 1);
+ for (int j = 0; j < 8; ++j) y[j] = d * grid[j];
+ #else
+ assert(false);
+ #endif
+
+ }
+
+ static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
+ template<typename dst_t>
+ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+ const int i = blockIdx.x;
+ const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
+
+ const int tid = threadIdx.x;
+ const int il = tid/8; // 0...3
+ const int ib = tid%8; // 0...7
+ dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+ const uint8_t * q4 = x[ib].qs + 4*il;
+ const float d = (float)x[ib].d;
+ for (int j = 0; j < 4; ++j) {
+ y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+ y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
+ }
+
+ }
+
  static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
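The dequantize_block_iq1_s kernel added above decodes 8 weights per qs byte: the byte plus one bit borrowed from the scale nibble forms a 9-bit index into the 512-entry iq1s_grid table, and the low 3 bits of the nibble give an odd multiplier 2*(h & 7) + 1 applied on top of the fp16 block scale d. A host-side restatement of that per-byte step (hypothetical helper, not part of the diff):

```cpp
#include <cstdint>

// grid_base is assumed to point at iq1s_grid reinterpreted as int8_t (8 signed values per entry).
static void dequant_iq1_s_8(float d, uint8_t qs_byte, uint8_t scale_nibble,
                            const int8_t * grid_base, float * out /* 8 floats */) {
    const int    index = qs_byte | ((scale_nibble & 8) << 5); // 9-bit grid index
    const float  scale = d * (2 * (scale_nibble & 7) + 1);    // odd scale in {1, 3, ..., 15}
    const int8_t * row = grid_base + 8 * index;
    for (int j = 0; j < 8; ++j) {
        out[j] = scale * row[j];
    }
}
```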
@@ -4478,10 +4671,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
  const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
  return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
  #else
+ (void) ksigns64;
  assert(false);
  return 0.f;
  #endif
  #else
+ (void) ksigns64;
  assert(false);
  return 0.f;
  #endif
@@ -4522,6 +4717,99 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
  #endif
  }

+ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+ #if QK_K == 256
+ const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
+
+ const int ib32 = iqs;
+ int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
+ const uint8_t h1 = bq1->scales[2*ib32+0];
+ const uint8_t h2 = bq1->scales[2*ib32+1];
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const int * q8 = (const int *)bq8_1[ib32].qs;
+ const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
+ const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
+ const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
+ const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
+ for (int j = 0; j < 2; ++j) {
+ sumi1 = __dp4a(q8[j+0], grid1[j], sumi1);
+ sumi2 = __dp4a(q8[j+2], grid2[j], sumi2);
+ sumi3 = __dp4a(q8[j+4], grid3[j], sumi3);
+ sumi4 = __dp4a(q8[j+6], grid4[j], sumi4);
+ }
+ #else
+ const int8_t * q8 = bq8_1[ib32].qs;
+ const int8_t * grid1 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
+ const int8_t * grid2 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
+ const int8_t * grid3 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
+ const int8_t * grid4 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
+ for (int j = 0; j < 8; ++j) {
+ sumi1 += q8[j+ 0] * grid1[j];
+ sumi2 += q8[j+ 8] * grid2[j];
+ sumi3 += q8[j+16] * grid3[j];
+ sumi4 += q8[j+24] * grid4[j];
+ }
+ #endif
+ const float d = (float)bq1->d * __low2float(bq8_1[ib32].ds);
+ return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) +
+ sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1));
+ #else
+ assert(false);
+ return 0.f;
+ #endif
+ }
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
+ int & val1, int & val2) {
+
+ uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
+ aux32 = q4 & 0x0f0f0f0f;
+ uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
+ uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
+ val1 = v1 | (v2 << 16);
+ aux32 = (q4 >> 4) & 0x0f0f0f0f;
+ v1 = values[q8[0]] | (values[q8[1]] << 8);
+ v2 = values[q8[2]] | (values[q8[3]] << 8);
+ val2 = v1 | (v2 << 16);
+ }
+ #endif
+
+ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+ const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
+ const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs;
+
+ const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
+
+ int v1, v2;
+ int sumi1 = 0, sumi2 = 0;
+ for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
+ const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
+ get_int_from_table_16(aux, values, v1, v2);
+ sumi1 = __dp4a(v1, q8[l+0], sumi1);
+ sumi2 = __dp4a(v2, q8[l+4], sumi2);
+ }
+
+ #else
+ const uint8_t * q4 = bq->qs + 4*iqs;
+ const int8_t * q8 = bq8_1->qs + 4*iqs;
+
+ int sumi1 = 0, sumi2 = 0;
+ for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) {
+ sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
+ sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >> 4];
+ }
+ #endif
+ const float d = (float)bq->d * __low2float(bq8_1->ds);
+ return d * (sumi1 + sumi2);
+ }
+
  template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
  allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
  static __device__ __forceinline__ void mul_mat_q(
@@ -5310,51 +5598,59 @@ template <bool need_check> static __global__ void
  #endif // __CUDA_ARCH__ >= CC_VOLTA
  }

- #define MMVQ_NWARPS_NVIDIA 4
- #define MMVQ_NWARPS_AMD_RDNA2 1
- #define MMVQ_NWARPS_AMD_OLD 4
-
- template <int nwarps, int ncols_y_template, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
+ template <int ncols_y, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
  #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
- __launch_bounds__(nwarps*WARP_SIZE, 1) // tells the compiler to use as many registers as it wants
+ // tell the compiler to use as many registers as it wants, see nwarps definition below
+ __launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
  #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
  static __global__ void mul_mat_vec_q(
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
- const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y_par, const int nrows_dst) {
-
- const int ncols_y = ncols_y_template != 0 ? ncols_y_template : ncols_y_par;
+ const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {

- const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
- const int row = blockIdx.x;
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+ constexpr int nwarps = 1;
+ constexpr int rows_per_cuda_block = 1;
+ #else
+ constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
+ constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)

- const int blocks_per_row_x = ncols_x / qk;
- const int blocks_per_col_y = nrows_y / QK8_1;
- const int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
+ const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
+ const int row0 = rows_per_cuda_block*blockIdx.x;
+ const int blocks_per_row_x = ncols_x / qk;
+ const int blocks_per_col_y = nrows_y / QK8_1;
+ constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;

  // partial sum for each thread
- float tmp[ncols_y_template != 0 ? ncols_y_template : 8] = {0.0f};
+ float tmp[ncols_y][rows_per_cuda_block] = {0.0f};

  const block_q_t * x = (const block_q_t *) vx;
  const block_q8_1 * y = (const block_q8_1 *) vy;

- for (int i = tid / (qi/vdr); i < blocks_per_row_x; i += blocks_per_iter) {
- const int ibx = row*blocks_per_row_x + i; // x block index
-
- const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+ for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
+ const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx

- const int iqs = vdr * (tid % (qi/vdr)); // x block quant index when casting the quants to int
+ // x block quant index when casting the quants to int
+ const int kqs = vdr * (tid % (qi/vdr));

  #pragma unroll
  for (int j = 0; j < ncols_y; ++j) {
- tmp[j] += vec_dot_q_cuda(&x[ibx], &y[j*blocks_per_col_y + iby], iqs);
+ #pragma unroll
+ for (int i = 0; i < rows_per_cuda_block; ++i) {
+ tmp[j][i] += vec_dot_q_cuda(
+ &x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs);
+ }
  }
  }

- __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y_template != 0 ? ncols_y_template : 8][WARP_SIZE];
+ __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
  if (threadIdx.y > 0) {
  #pragma unroll
  for (int j = 0; j < ncols_y; ++j) {
- tmp_shared[threadIdx.y-1][j][threadIdx.x] = tmp[j];
+ #pragma unroll
+ for (int i = 0; i < rows_per_cuda_block; ++i) {
+ tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
+ }
  }
  }
  __syncthreads();
@@ -5366,13 +5662,16 @@ static __global__ void mul_mat_vec_q(
  #pragma unroll
  for (int j = 0; j < ncols_y; ++j) {
  #pragma unroll
- for (int i = 0; i < nwarps-1; ++i) {
- tmp[j] += tmp_shared[i][j][threadIdx.x];
+ for (int i = 0; i < rows_per_cuda_block; ++i) {
+ #pragma unroll
+ for (int l = 0; l < nwarps-1; ++l) {
+ tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
+ }
+ tmp[j][i] = warp_reduce_sum(tmp[j][i]);
  }
- tmp[j] = warp_reduce_sum(tmp[j]);

- if (threadIdx.x == 0) {
- dst[j*nrows_dst + row] = tmp[j];
+ if (threadIdx.x < rows_per_cuda_block) {
+ dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
  }
  }
  }
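The reworked mul_mat_vec_q kernel above now processes up to two output rows per CUDA block and picks its warp count from the compile-time batch size. A sketch of the resulting launch geometry, mirroring the constexpr choices in the kernel and the host-side dispatch later in this diff (a simplified restatement, not the upstream code):

```cpp
struct mmvq_launch { int nwarps; int rows_per_cuda_block; int nblocks; };

static mmvq_launch pick_mmvq_launch(int ncols_y, int nrows_x, bool amd_rdna2_or_newer) {
    mmvq_launch l;
    if (amd_rdna2_or_newer) {                 // RDNA2/RDNA3: one warp, one row per block
        l.nwarps = 1;
        l.rows_per_cuda_block = 1;
    } else {                                  // NVIDIA and older AMD
        l.nwarps = ncols_y <= 4 ? 4 : 2;
        l.rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
    }
    l.nblocks = (nrows_x + l.rows_per_cuda_block - 1) / l.rows_per_cuda_block;
    return l;
}
```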
@@ -5945,149 +6244,31 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
  dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
  }

- template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
- static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
- #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
- const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
- const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;
+ template <bool vals_smem, int ncols_template, int block_size_template>
+ static __global__ void soft_max_f32(const float * x, const float * mask, const float * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
+ const int ncols = ncols_template == 0 ? ncols_par : ncols_template;

  const int tid = threadIdx.x;
  const int rowx = blockIdx.x;
- const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+ const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension

  const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;

  const int warp_id = threadIdx.x / WARP_SIZE;
  const int lane_id = threadIdx.x % WARP_SIZE;

- extern __shared__ half data_soft_max_f16[];
- half * buf_iw = data_soft_max_f16 + 0; // shared memory buffer for inter-warp communication
- // (shared memory) buffer to cache values between iterations:
- half2 * vals = vals_smem ? (half2 *) (buf_iw + WARP_SIZE) : (half2 *) (dst + rowx*ncols_data);
- // if the buffer is larger than max. shared memory per block, use dst as temp. buffer instead
- // in that case col_smem == col_data must be enforced to avoid race conditions
+ float slope = 0.0f;

- half2 max_val = make_half2(-INFINITY, -INFINITY);
+ // ALiBi
+ if (max_bias > 0.0f) {
+ const int h = rowx/nrows_y; // head index

- #pragma unroll
- for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
- const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
- const int col_smem = vals_smem ? col0 + tid : col_data;
+ const float base = h < n_head_log2 ? m0 : m1;
+ const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

- const int ix = rowx*ncols_data + col_data;
- const int iy = rowy*ncols_data + col_data;
-
- half2 val;
- if (need_check && col_data + 0 >= ncols_data) {
- val.x = -INFINITY;
- } else {
- val.x = x[ix + 0]*scale + (y ? y[iy + 0] : 0.0f);
- }
- if (need_check && col_data + WARP_SIZE >= ncols_data) {
- val.y = -INFINITY;
- } else {
- val.y = x[ix + WARP_SIZE]*scale + (y ? y[iy + WARP_SIZE] : 0.0f);
- }
- if (!need_check || col_smem < (vals_smem ? ncols_smem : ncols_data)) {
- vals[col_smem] = val;
- }
- max_val = __hmax2(max_val, val);
+ slope = powf(base, exp);
  }

- // find the max value in the block
- max_val = warp_reduce_max(max_val);
- if (block_size > WARP_SIZE) {
- if (warp_id == 0) {
- buf_iw[lane_id] = -INFINITY;
- }
- __syncthreads();
-
- if (lane_id == 0) {
- buf_iw[warp_id] = __hmax(max_val.x, max_val.y);
- }
- __syncthreads();
-
- max_val = __half2half2(buf_iw[lane_id]);
- max_val = warp_reduce_max(max_val);
- } else {
- max_val = __half2half2(__hmax(max_val.x, max_val.y));
- }
-
- half2 tmp = make_half2(0.0f, 0.0f); // partial sums
-
- #pragma unroll
- for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
- const int col_smem = vals_smem ? col0 + tid : 2*col0 + 2*warp_id*WARP_SIZE + lane_id;
-
- if (ncols_template == 0 && col_smem >= (vals_smem ? ncols_smem : ncols_data)) {
- break;
- }
-
- const half2 val = h2exp(vals[col_smem] - max_val);
-
- tmp += val;
- vals[col_smem] = val;
- }
-
- // find the sum of exps in the block
- tmp = warp_reduce_sum(tmp);
- if (block_size > WARP_SIZE) {
- if (warp_id == 0) {
- buf_iw[lane_id] = 0.0f;
- }
- __syncthreads();
-
- if (lane_id == 0) {
- buf_iw[warp_id] = tmp.x + tmp.y;
- }
- __syncthreads();
-
- tmp = __half2half2(buf_iw[lane_id]);
- tmp = warp_reduce_sum(tmp);
- } else {
- tmp = __half2half2(tmp.x + tmp.y);
- }
-
- const half2 inv_sum = make_half2(1.0f, 1.0f) / tmp;
-
- #pragma unroll
- for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
- const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
- const int col_smem = vals_smem ? col0 + tid : col_data;
-
- const int idst = rowx*ncols_data + col_data;
- const half2 result = vals[col_smem] * inv_sum;
-
- if (need_check && col_data + 0 >= ncols_data) {
- return;
- }
- dst[idst] = result.x;
-
- if (need_check && col_data + WARP_SIZE >= ncols_data) {
- return;
- }
-
- dst[idst + WARP_SIZE] = result.y;
- }
- #else
- (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
- NO_DEVICE_CODE;
- #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
- }
-
- template <bool vals_smem, int ncols_template, int block_size_template>
- static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
- const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
-
- const int tid = threadIdx.x;
- const int rowx = blockIdx.x;
- const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
-
- const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
-
- const int warp_id = threadIdx.x / WARP_SIZE;
- const int lane_id = threadIdx.x % WARP_SIZE;
-
  extern __shared__ float data_soft_max_f32[];
  float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
  // shared memory buffer to cache values between iterations:
@@ -6106,7 +6287,8 @@ static __global__ void soft_max_f32(const float * x, const float * y, float * ds
  const int ix = rowx*ncols + col;
  const int iy = rowy*ncols + col;

- const float val = x[ix]*scale + (y ? y[iy] : 0.0f);
+ const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
+
  vals[col] = val;
  max_val = max(max_val, val);
  }
@@ -6667,6 +6849,18 @@ static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k,
  dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
  }

+ template<typename dst_t>
+ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ const int nb = k / QK_K;
+ dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
+ }
+
+ template<typename dst_t>
+ static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+ const int nb = (k + QK_K - 1) / QK_K;
+ dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
+ }
+
  template <typename src_t, typename dst_t>
  static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
@@ -6706,6 +6900,10 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
  return dequantize_row_iq2_xs_cuda;
  case GGML_TYPE_IQ3_XXS:
  return dequantize_row_iq3_xxs_cuda;
+ case GGML_TYPE_IQ1_S:
+ return dequantize_row_iq1_s_cuda;
+ case GGML_TYPE_IQ4_NL:
+ return dequantize_row_iq4_nl_cuda;
  case GGML_TYPE_F32:
  return convert_unary_cuda<float>;
  default:
@@ -6741,6 +6939,10 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
  return dequantize_row_iq2_xs_cuda;
  case GGML_TYPE_IQ3_XXS:
  return dequantize_row_iq3_xxs_cuda;
+ case GGML_TYPE_IQ1_S:
+ return dequantize_row_iq1_s_cuda;
+ case GGML_TYPE_IQ4_NL:
+ return dequantize_row_iq4_nl_cuda;
  case GGML_TYPE_F16:
  return convert_unary_cuda<half>;
  default:
@@ -6851,65 +7053,75 @@ static void mul_mat_vec_q_cuda(
  const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {

  GGML_ASSERT(ncols_x % qk == 0);
- GGML_ASSERT(ncols_y <= 4);
+ GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);

  int id;
  CUDA_CHECK(cudaGetDevice(&id));

- int nwarps;
- if (g_device_caps[id].cc >= CC_OFFSET_AMD) {
- nwarps = g_device_caps[id].cc >= CC_RDNA2 ? MMVQ_NWARPS_AMD_RDNA2 : MMVQ_NWARPS_AMD_OLD;
- } else {
- nwarps = MMVQ_NWARPS_NVIDIA;
- }
+ int64_t nwarps = 1;
+ int64_t rows_per_cuda_block = 1;

- const dim3 block_nums(nrows_x, 1, 1);
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
- switch (nwarps) {
- case 1: switch(ncols_y) {
+ if (g_device_caps[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
+ switch(ncols_y) {
  case 1:
- mul_mat_vec_q<1, 1, qk, qi, block_q_t, vdr, vec_dot>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
+ nwarps = 4;
+ rows_per_cuda_block = 1;
  break;
  case 2:
- mul_mat_vec_q<1, 2, qk, qi, block_q_t, vdr, vec_dot>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
- break;
  case 3:
- mul_mat_vec_q<1, 3, qk, qi, block_q_t, vdr, vec_dot>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
- break;
  case 4:
- mul_mat_vec_q<1, 4, qk, qi, block_q_t, vdr, vec_dot>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
+ nwarps = 4;
+ rows_per_cuda_block = 2;
  break;
- default:
- GGML_ASSERT(false);
- break;
- } break;
- case 4: switch(ncols_y) {
- case 1:
- mul_mat_vec_q<4, 1, qk, qi, block_q_t, vdr, vec_dot>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
- break;
- case 2:
- mul_mat_vec_q<4, 2, qk, qi, block_q_t, vdr, vec_dot>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
- break;
- case 3:
- mul_mat_vec_q<4, 3, qk, qi, block_q_t, vdr, vec_dot>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
- break;
- case 4:
- mul_mat_vec_q<4, 4, qk, qi, block_q_t, vdr, vec_dot>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
+ case 5:
+ case 6:
+ case 7:
+ case 8:
+ nwarps = 2;
+ rows_per_cuda_block = 2;
  break;
  default:
  GGML_ASSERT(false);
  break;
- } break;
+ }
+ }
+ const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
+ const dim3 block_nums(nblocks, 1, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);

+ switch (ncols_y) {
+ case 1:
+ mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+ break;
+ case 2:
+ mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+ break;
+ case 3:
+ mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+ break;
+ case 4:
+ mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+ break;
+ case 5:
+ mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+ break;
+ case 6:
+ mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+ break;
+ case 7:
+ mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+ break;
+ case 8:
+ mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot>
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+ break;
  default:
  GGML_ASSERT(false);
  break;
@@ -7568,89 +7780,53 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
  diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
  }

- static void soft_max_f16_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
- int nth = WARP_SIZE;
- while (nth < ncols_x/2 && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
- const dim3 block_dims(nth, 1, 1);
- const dim3 block_nums(nrows_x, 1, 1);
- const size_t shmem = (GGML_PAD(ncols_x, 2*WARP_SIZE) + WARP_SIZE)*sizeof(half);
- static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
- if (shmem <= g_device_caps[g_main_device].smpb) {
- switch (ncols_x) {
- case 32:
- soft_max_f16<true, 32, 32, true><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
- break;
- case 64:
- soft_max_f16<true, 64, 32, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
- break;
- case 128:
- soft_max_f16<true, 128, 64, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
- break;
- case 256:
- soft_max_f16<true, 256, 128, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
- break;
- case 512:
- soft_max_f16<true, 512, 256, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
- break;
- case 1024:
- soft_max_f16<true, 1024, 512, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
- break;
- case 2048:
- soft_max_f16<true, 2048, 1024, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
- break;
- case 4096:
- soft_max_f16<true, 4096, 1024, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
- break;
- default:
- soft_max_f16<true, 0, 0, true><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
- break;
- }
- } else {
- const size_t shmem_low = WARP_SIZE*sizeof(half);
- soft_max_f16<false, 0, 0, true><<<block_nums, block_dims, shmem_low, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
- }
- }
-
- static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
+ static void soft_max_f32_cuda(const float * x, const float * mask, const float * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
  int nth = WARP_SIZE;
  while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
  const dim3 block_dims(nth, 1, 1);
  const dim3 block_nums(nrows_x, 1, 1);
  const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
  static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
+
+ const uint32_t n_head_kv = nrows_x/nrows_y;
+ const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
+
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
  if (shmem < g_device_caps[g_main_device].smpb) {
  switch (ncols_x) {
  case 32:
- soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+ soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
  break;
  case 64:
- soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+ soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
  break;
  case 128:
- soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+ soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
  break;
  case 256:
- soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+ soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
  break;
  case 512:
- soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+ soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
  break;
  case 1024:
- soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+ soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
  break;
  case 2048:
- soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+ soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
  break;
  case 4096:
- soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+ soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
  break;
  default:
- soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+ soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
  break;
  }
  } else {
  const size_t shmem_low = WARP_SIZE*sizeof(float);
- soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+ soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
  }
  }
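soft_max_f32_cuda above now derives the ALiBi parameters m0, m1 and n_head_log2 on the host, and the kernel combines them into a per-head slope. A standalone restatement of that slope computation (not part of the diff; parameterized by head index and head count for clarity):

```cpp
#include <cmath>

static float alibi_slope(int head, int n_head, float max_bias) {
    if (max_bias <= 0.0f) {
        return 0.0f;                                   // ALiBi disabled: plain masked softmax
    }
    const unsigned n_head_log2 = 1u << (unsigned) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    const float base = head < (int) n_head_log2 ? m0 : m1;
    const int   exp  = head < (int) n_head_log2 ? head + 1 : 2*(head - (int) n_head_log2) + 1;
    return powf(base, exp);                            // added to the logits as slope * pos[col]
}
```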
7656
7832
 
@@ -7922,6 +8098,7 @@ GGML_CALL void ggml_init_cublas() {
  if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
  initialized = true;
  g_cublas_loaded = false;
+ fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
  return;
  }

@@ -8509,6 +8686,8 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
  case GGML_TYPE_IQ2_XXS:
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ4_NL:
  return max_compute_capability >= CC_RDNA2 ? 128 : 64;
  default:
  GGML_ASSERT(false);
@@ -8532,6 +8711,8 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
  case GGML_TYPE_IQ2_XXS:
  case GGML_TYPE_IQ2_XS:
  case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ4_NL:
  return max_compute_capability >= CC_VOLTA ? 128 : 64;
  case GGML_TYPE_Q6_K:
  return 64;
@@ -8629,6 +8810,14 @@ static void ggml_cuda_op_mul_mat_vec_q(
  mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
  (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
  break;
+ case GGML_TYPE_IQ1_S:
+ mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+ break;
+ case GGML_TYPE_IQ4_NL:
+ mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+ break;
  default:
  GGML_ASSERT(false);
  break;
@@ -9068,30 +9257,36 @@ static void ggml_cuda_op_soft_max(

  GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional

- const int64_t ne00 = src0->ne[0];
+ const int64_t ne00 = src0->ne[0];
  const int64_t nrows_x = ggml_nrows(src0);
- const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
+ const int64_t nrows_y = src0->ne[1];

- float scale = 1.0f;
- memcpy(&scale, dst->op_params, sizeof(float));
+ float scale = 1.0f;
+ float max_bias = 0.0f;

- #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
- #ifdef GGML_CUDA_F16
- const bool use_f16_soft_max = true;
- #else
- const bool use_f16_soft_max = false;
- #endif // GGML_CUDA_F16
- #else
- const bool use_f16_soft_max = false;
- #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX
+ memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+ memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));

- if (use_f16_soft_max) {
- soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
- } else {
- soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
+ // positions tensor
+ float * src2_dd = nullptr;
+ cuda_pool_alloc<float> src2_f;
+
+ ggml_tensor * src2 = dst->src[2];
+ const bool use_src2 = src2 != nullptr;
+
+ if (use_src2) {
+ const bool src2_on_device = src2->backend == GGML_BACKEND_GPU;
+
+ if (src2_on_device) {
+ ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
+ src2_dd = (float *) src2_extra->data_device[g_main_device];
+ } else {
+ src2_dd = src2_f.alloc(ggml_nelements(src2));
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
+ }
  }

- (void) dst;
+ soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream);
  }

  static void ggml_cuda_op_scale(
@@ -9226,9 +9421,15 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
  CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
  if (can_access_peer) {
  if (enable_peer_access) {
- CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
+ cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
+ if (err != cudaErrorPeerAccessAlreadyEnabled) {
+ CUDA_CHECK(err);
+ }
  } else {
- CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
+ cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
+ if (err != cudaErrorPeerAccessNotEnabled) {
+ CUDA_CHECK(err);
+ }
  }
  }
  }
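The peer-access change above makes multi-GPU setup idempotent: re-enabling access that is already on (or disabling access that was never enabled) is no longer fatal. A minimal sketch of the same pattern, assuming the caller has already selected the source device with cudaSetDevice and that CUDA_CHECK is the file's error macro (not part of the diff):

```cpp
static void try_enable_peer_access(int id_other) {
    const cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
    if (err != cudaSuccess && err != cudaErrorPeerAccessAlreadyEnabled) {
        CUDA_CHECK(err); // every other failure is still reported
    }
}
```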
@@ -9735,7 +9936,7 @@ static __global__ void k_compute_batched_ptrs(
  ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
  }

- static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(!ggml_is_transposed(src0));
  GGML_ASSERT(!ggml_is_transposed(src1));

@@ -9893,39 +10094,69 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 
  int64_t min_compute_capability = INT_MAX;
 
+ bool any_pascal_with_slow_fp16 = false;
  if (split) {
  ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
  auto & tensor_split = buft_ctx->tensor_split;
  for (int id = 0; id < g_device_count; ++id) {
- if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
+ // skip devices that are not going to do any work:
+ if (tensor_split[id] >= (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
+ continue;
+ }
+
+ if (min_compute_capability > g_device_caps[id].cc) {
  min_compute_capability = g_device_caps[id].cc;
  }
+ if (g_device_caps[id].cc == 610) {
+ any_pascal_with_slow_fp16 = true;
+ }
  }
  } else {
- min_compute_capability = g_device_caps[g_main_device].cc;
+ min_compute_capability = g_device_caps[g_main_device].cc;
+ any_pascal_with_slow_fp16 = g_device_caps[g_main_device].cc == 610;
  }
 
+ // check data types and tensor shapes for custom matrix multiplication kernels:
+ bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
+ && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+ && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
+
+ bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+ && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+ && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+
+ bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
+ && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 
  const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
- bool use_mul_mat_q = ggml_is_quantized(src0->type);
+
  #ifdef CUDA_USE_TENSOR_CORES
  use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
  #endif // CUDA_USE_TENSOR_CORES
 
  #else
 
- const bool fp16_performance_good = min_compute_capability >= CC_VOLTA;
- bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+ // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
+ const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
+
+ // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
+ use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
+ use_mul_mat_q = use_mul_mat_q && min_compute_capability >= MIN_CC_DP4A;
+
  #ifdef CUDA_USE_TENSOR_CORES
  // when tensor cores are available, use them for large batch size
  // ref: https://github.com/ggerganov/llama.cpp/pull/3776
- use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE);
+ use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
  #endif // CUDA_USE_TENSOR_CORES
 
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 
- use_mul_mat_q = use_mul_mat_q && ggml_cuda_supports_mmq(src0->type);
+ // if mmvq is available it's a better choice than dmmv:
+ #ifndef GGML_CUDA_FORCE_DMMV
+ use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+ #endif // GGML_CUDA_FORCE_DMMV
 
  // debug helpers
  //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
@@ -9943,33 +10174,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
  } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
  // KQ + KQV multi-batch
- ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
- } else if (src0->type == GGML_TYPE_F32) {
- ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
- } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
- if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->type == GGML_TYPE_F32) {
- #ifdef GGML_CUDA_FORCE_DMMV
- const bool use_mul_mat_vec_q = false;
- #else
- const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
- #endif // GGML_CUDA_FORCE_DMMV
-
- if (use_mul_mat_vec_q) {
- ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
- } else {
- ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
- }
- } else {
- if (src1->ne[1] <= 4 && min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32) {
- ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
- } else if (use_mul_mat_q) {
- ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
- } else {
- ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
- }
- }
+ ggml_cuda_mul_mat_batched_cublas(src0, src1, dst);
+ } else if (use_dequantize_mul_mat_vec) {
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+ } else if (use_mul_mat_vec_q) {
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+ } else if (use_mul_mat_q) {
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
  } else {
- GGML_ASSERT(false);
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
  }
  }
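
With the flags hoisted out of the branch bodies, the kernel choice collapses to one flat if/else chain: dmmv, then mmvq, then mmq, then the cuBLAS fallback, with dmmv suppressed whenever mmvq is usable (unless GGML_CUDA_FORCE_DMMV is defined). A simplified, self-contained sketch of that decision; the type checks and the F16 dmmv path are omitted, and all names are illustrative rather than ggml API:

    enum class mulmat_kernel { DMMV, MMVQ, MMQ, CUBLAS };

    static mulmat_kernel pick_mulmat_kernel(int min_cc, long batch, bool src0_quantized,
                                            bool fp16_good, bool tensor_cores) {
        const int  MIN_CC_DP4A         = 610;
        const long MMVQ_MAX_BATCH_SIZE = 8;
        const long MMQ_MAX_BATCH_SIZE  = 32;

        bool use_dmmv = src0_quantized && batch == 1;
        bool use_mmvq = src0_quantized && batch <= MMVQ_MAX_BATCH_SIZE && min_cc >= MIN_CC_DP4A;
        bool use_mmq  = src0_quantized && min_cc >= MIN_CC_DP4A;

        if (tensor_cores) {
            // large fp16-friendly batches go to cuBLAS + tensor cores instead of mmq
            use_mmq = use_mmq && (!fp16_good || batch <= MMQ_MAX_BATCH_SIZE);
        }
        use_dmmv = use_dmmv && !use_mmvq; // mmvq wins over dmmv when both apply

        if (use_dmmv) return mulmat_kernel::DMMV;
        if (use_mmvq) return mulmat_kernel::MMVQ;
        if (use_mmq)  return mulmat_kernel::MMQ;
        return mulmat_kernel::CUBLAS;     // dequantize / convert and run a GEMM
    }
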
 
@@ -10888,10 +11101,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backe
  UNUSED(buffer);
  }
 
- // unused at the moment
- //static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
- // return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
- //}
+ static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
+ return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
+ UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
+ }
 
  GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
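
ggml_backend_buffer_is_cuda_split identifies split buffers by comparing the get_name function pointer stored in the buffer's interface, and the self-referencing UNUSED() keeps the unused-function warning quiet in release builds, where the only caller (an assert) is compiled out. The same pointer-identity idea in a self-contained toy; the struct layout below is a stand-in, not the real ggml_backend types:

    #include <cstdio>

    struct toy_buffer_iface {
        const char * (*get_name)(void * ctx);
    };
    struct toy_buffer {
        toy_buffer_iface iface;
        void *           context;
    };

    static const char * split_get_name(void *) { return "Split"; }
    static const char * plain_get_name(void *) { return "Plain"; }

    // Identify the buffer kind by function-pointer identity instead of strcmp:
    // cheap, and it cannot collide with another backend's name string.
    static bool toy_buffer_is_split(const toy_buffer & buf) {
        return buf.iface.get_name == split_get_name;
    }

    int main() {
        toy_buffer a = { { split_get_name }, nullptr };
        toy_buffer b = { { plain_get_name }, nullptr };
        std::printf("%d %d\n", toy_buffer_is_split(a), toy_buffer_is_split(b)); // prints "1 0"
        return 0;
    }
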
@@ -11279,7 +11492,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  if (node->src[j] != nullptr) {
  assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
- assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+ assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
  assert(node->src[j]->extra != nullptr);
  }
  }
@@ -11327,7 +11540,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
  return false;
  }
  ggml_type a_type = a->type;
- if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS) {
+ if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
+ a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL) {
  if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
  return false;
  }