llama_cpp 0.12.6 → 0.12.7
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +21 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +8 -1
- data/vendor/tmp/llama.cpp/Makefile +43 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -9
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +908 -54
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +81 -203
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +124 -52
- data/vendor/tmp/llama.cpp/ggml.c +948 -504
- data/vendor/tmp/llama.cpp/ggml.h +24 -11
- data/vendor/tmp/llama.cpp/llama.cpp +688 -163
- data/vendor/tmp/llama.cpp/llama.h +37 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- metadata +2 -2
@@ -1,3 +1,7 @@
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+
 #include <algorithm>
 #include <assert.h>
 #include <atomic>
@@ -54,6 +58,8 @@
 #define cudaDeviceProp hipDeviceProp_t
 #define cudaDeviceSynchronize hipDeviceSynchronize
 #define cudaError_t hipError_t
+#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
+#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
 #define cudaEventCreateWithFlags hipEventCreateWithFlags
 #define cudaEventDisableTiming hipEventDisableTiming
 #define cudaEventRecord hipEventRecord
@@ -119,11 +125,6 @@
 
 #endif // defined(GGML_USE_HIPBLAS)
 
-// ggml-cuda need half type so keep ggml headers include at last
-#include "ggml-cuda.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
-
 #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 
 #define CC_PASCAL 600
@@ -150,8 +151,8 @@
 #define CUDA_USE_TENSOR_CORES
 #endif
 
-// max batch size to use MMQ kernels when tensor cores are available
-#define MMQ_MAX_BATCH_SIZE 32
+#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
+#define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available
 
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
@@ -517,6 +518,24 @@ typedef struct {
 } block_iq3_xxs;
 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
 
+#define QR1_S 8
+#define QI1_S (QK_K / (4*QR1_S))
+typedef struct {
+    half d;
+    uint8_t qs[QK_K/8];
+    uint8_t scales[QK_K/16];
+} block_iq1_s;
+static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
+
+#define QK4_NL 32
+#define QR4_NL 2
+#define QI4_NL (QK4_NL / (4*QR4_NL))
+typedef struct {
+    half d;
+    uint8_t qs[QK4_NL/2];
+} block_iq4_nl;
+static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
+
 #define WARP_SIZE 32
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
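For context on the two `static_assert`s above: with the standard `QK_K == 256` super-block size, the new formats are very compact. The following is an illustrative C sketch of the size arithmetic, not code from the diff:

```c
#include <stdio.h>

#define QK_K   256 // ggml super-block size (default build)
#define QK4_NL  32

int main(void) {
    // block_iq1_s: fp16 scale (2 bytes) + QK_K/8 grid indices + QK_K/16 packed scales
    const int iq1_s_bytes  = 2 + QK_K/8 + QK_K/16; // 50 bytes per 256 weights
    // block_iq4_nl: fp16 scale (2 bytes) + QK4_NL/2 bytes of packed 4-bit codes
    const int iq4_nl_bytes = 2 + QK4_NL/2;         // 18 bytes per 32 weights

    printf("iq1_s : %d bytes -> %.4f bits/weight\n", iq1_s_bytes,  8.0*iq1_s_bytes/QK_K);
    printf("iq4_nl: %d bytes -> %.4f bits/weight\n", iq4_nl_bytes, 8.0*iq4_nl_bytes/QK4_NL);
    return 0;
}
```

This works out to 1.5625 bits/weight for IQ1_S and 4.5 bits/weight for IQ4_NL, matching the sizes the asserts pin down.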
@@ -642,18 +661,18 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
     return a;
 }
 
-static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
-    }
-    return a;
-#else
-    (void) a;
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-}
+//static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+//#pragma unroll
+//    for (int mask = 16; mask > 0; mask >>= 1) {
+//        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+//    }
+//    return a;
+//#else
+//    (void) a;
+//    NO_DEVICE_CODE;
+//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+//}
 
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
@@ -663,18 +682,18 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
     return x;
 }
 
-static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-    }
-    return x;
-#else
-    (void) x;
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-}
+//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
+//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
+//#pragma unroll
+//    for (int mask = 16; mask > 0; mask >>= 1) {
+//        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+//    }
+//    return x;
+//#else
+//    (void) x;
+//    NO_DEVICE_CODE;
+//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
+//}
 
 static __device__ __forceinline__ float op_repeat(const float a, const float b) {
     return b;
@@ -1681,6 +1700,137 @@ static const __device__ uint32_t iq3xxs_grid[256] = {
     0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
 };
 
+static const __device__ uint64_t iq1s_grid[512] = {
+    0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
+    0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
+    0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
+    0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
+    0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
+    0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
+    0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
+    0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
+    0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
+    0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
+    0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
+    0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
+    0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
+    0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
+    0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
+    0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
+    0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
+    0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
+    0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
+    0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
+    0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
+    0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
+    0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
+    0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
+    0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
+    0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
+    0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
+    0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
+    0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
+    0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
+    0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
+    0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
+    0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
+    0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
+    0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
+    0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
+    0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
+    0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
+    0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
+    0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
+    0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
+    0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
+    0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
+    0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
+    0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
+    0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
+    0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
+    0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
+    0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
+    0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
+    0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
+    0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
+    0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
+    0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
+    0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
+    0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
+    0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
+    0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
+    0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
+    0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
+    0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
+    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
+    0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
+    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
+    0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
+    0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
+    0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
+    0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
+    0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
+    0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
+    0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
+    0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
+    0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
+    0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
+    0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
+    0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
+    0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
+    0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
+    0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
+    0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
+    0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
+    0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
+    0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
+    0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
+    0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
+    0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
+    0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
+    0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
+    0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
+    0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
+    0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
+    0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
+    0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
+    0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
+    0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
+    0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
+    0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
+    0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
+    0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
+    0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
+    0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
+    0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
+    0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
+    0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
+    0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
+    0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
+    0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
+    0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
+    0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
+    0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
+    0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
+    0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
+    0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
+    0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
+    0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
+    0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
+    0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
+    0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
+    0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
+    0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
+    0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
+    0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
+    0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
+    0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
+    0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
+    0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
+    0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
+    0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
+};
+
 static const __device__ uint8_t ksigns_iq2xs[128] = {
       0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
     144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
@@ -1823,6 +1973,49 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
 
 }
 
+template<typename dst_t>
+static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int i   = blockIdx.x;
+    const block_iq1_s * x = (const block_iq1_s *) vx;
+
+    const int tid = threadIdx.x;
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const int i8 = 4*ib+il;
+    uint8_t h = x[i].scales[i8/2] >> 4*(i8%2);
+    const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5)));
+    const float d = (float)x[i].d * (2*(h & 7) + 1);
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j];
+#else
+    assert(false);
+#endif
+
+}
+
+static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int i   = blockIdx.x;
+    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
+
+    const int tid = threadIdx.x;
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+    const uint8_t * q4 = x[ib].qs + 4*il;
+    const float d = (float)x[ib].d;
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
+    }
+
+}
+
 static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
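The IQ4_NL codes index a small nonlinear codebook rather than a linear 4-bit scale. The following is a host-side C sketch of the same mapping the `dequantize_block_iq4_nl` kernel applies; the table is copied from the diff above, and the function name is illustrative only:

```c
#include <stdint.h>

// nonlinear codebook, copied from the kvalues_iq4nl table added above
static const int8_t kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
};

// illustrative CPU reference: expand one 32-weight iq4_nl block;
// d is the block's fp16 scale converted to float, qs holds the 16 packed bytes
static void dequant_iq4_nl_ref(float d, const uint8_t qs[16], float out[32]) {
    for (int j = 0; j < 16; ++j) {
        out[j +  0] = d * kvalues_iq4nl[qs[j] & 0xf]; // low nibble -> first half
        out[j + 16] = d * kvalues_iq4nl[qs[j] >>  4]; // high nibble -> second half
    }
}
```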
@@ -4478,10 +4671,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
     const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
     return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
 #else
+    (void) ksigns64;
     assert(false);
     return 0.f;
 #endif
 #else
+    (void) ksigns64;
     assert(false);
     return 0.f;
 #endif
@@ -4522,6 +4717,99 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
 #endif
 }
 
+static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+#if QK_K == 256
+    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
+
+    const int ib32 = iqs;
+    int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
+    const uint8_t h1 = bq1->scales[2*ib32+0];
+    const uint8_t h2 = bq1->scales[2*ib32+1];
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const int * q8 = (const int *)bq8_1[ib32].qs;
+    const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
+    const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
+    const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
+    const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
+    for (int j = 0; j < 2; ++j) {
+        sumi1 = __dp4a(q8[j+0], grid1[j], sumi1);
+        sumi2 = __dp4a(q8[j+2], grid2[j], sumi2);
+        sumi3 = __dp4a(q8[j+4], grid3[j], sumi3);
+        sumi4 = __dp4a(q8[j+6], grid4[j], sumi4);
+    }
+#else
+    const int8_t * q8 = bq8_1[ib32].qs;
+    const int8_t * grid1 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
+    const int8_t * grid2 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
+    const int8_t * grid3 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
+    const int8_t * grid4 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
+    for (int j = 0; j < 8; ++j) {
+        sumi1 += q8[j+ 0] * grid1[j];
+        sumi2 += q8[j+ 8] * grid2[j];
+        sumi3 += q8[j+16] * grid3[j];
+        sumi4 += q8[j+24] * grid4[j];
+    }
+#endif
+    const float d = (float)bq1->d * __low2float(bq8_1[ib32].ds);
+    return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) +
+                sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1));
+#else
+    assert(false);
+    return 0.f;
+#endif
+}
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
+        int & val1, int & val2) {
+
+    uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
+    aux32 = q4 & 0x0f0f0f0f;
+    uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
+    uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
+    val1 = v1 | (v2 << 16);
+    aux32 = (q4 >> 4) & 0x0f0f0f0f;
+    v1 = values[q8[0]] | (values[q8[1]] << 8);
+    v2 = values[q8[2]] | (values[q8[3]] << 8);
+    val2 = v1 | (v2 << 16);
+}
+#endif
+
+static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
+    const int32_t  * q8 = (const int32_t  *)bq8_1->qs + iqs;
+
+    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
+
+    int v1, v2;
+    int sumi1 = 0, sumi2 = 0;
+    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
+        const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
+        get_int_from_table_16(aux, values, v1, v2);
+        sumi1 = __dp4a(v1, q8[l+0], sumi1);
+        sumi2 = __dp4a(v2, q8[l+4], sumi2);
+    }
+
+#else
+    const uint8_t * q4 = bq->qs + 4*iqs;
+    const int8_t  * q8 = bq8_1->qs + 4*iqs;
+
+    int sumi1 = 0, sumi2 = 0;
+    for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) {
+        sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
+        sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >>  4];
+    }
+#endif
+    const float d = (float)bq->d * __low2float(bq8_1->ds);
+    return d * (sumi1 + sumi2);
+}
+
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
           allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
 static __device__ __forceinline__ void mul_mat_q(
@@ -5310,51 +5598,59 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
-#define MMVQ_NWARPS_NVIDIA    4
-#define MMVQ_NWARPS_AMD_RDNA2 1
-#define MMVQ_NWARPS_AMD_OLD   4
-
-template <int nwarps, int ncols_y_template, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
+template <int ncols_y, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-__launch_bounds__(nwarps*WARP_SIZE, 1)
+// tell the compiler to use as many registers as it wants, see nwarps definition below
+__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void mul_mat_vec_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y_par, const int nrows_dst) {
-
-    const int ncols_y = ncols_y_template != 0 ? ncols_y_template : ncols_y_par;
+    const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
 
-    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
-    const int row = blockIdx.x;
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+    constexpr int nwarps              = 1;
+    constexpr int rows_per_cuda_block = 1;
+#else
+    constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
+    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
 
-    const int blocks_per_row_x = ncols_x / qk;
-    const int blocks_per_col_y = nrows_y / QK8_1;
-    const int blocks_per_iter  = vdr * nwarps*WARP_SIZE / qi;
+    const     int tid  = WARP_SIZE*threadIdx.y + threadIdx.x;
+    const     int row0 = rows_per_cuda_block*blockIdx.x;
+    const     int blocks_per_row_x = ncols_x / qk;
+    const     int blocks_per_col_y = nrows_y / QK8_1;
+    constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
 
     // partial sum for each thread
-    float tmp[ncols_y_template != 0 ? ncols_y_template : 4] = {0.0f};
+    float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
 
     const block_q_t  * x = (const block_q_t  *) vx;
     const block_q8_1 * y = (const block_q8_1 *) vy;
 
-    for (int i = tid / (qi/vdr); i < blocks_per_row_x; i += blocks_per_iter) {
-        const int ibx = row*blocks_per_row_x + i; // x block index
-
-        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+    for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
+        const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
 
-        const int iqs = vdr * (tid % (qi/vdr)); // x block quant index when casting the quants to int
+        // x block quant index when casting the quants to int
+        const int kqs = vdr * (tid % (qi/vdr));
 
 #pragma unroll
         for (int j = 0; j < ncols_y; ++j) {
-            tmp[j] += vec_dot_q_cuda(&x[ibx], &y[j*blocks_per_col_y + iby], iqs);
+#pragma unroll
+            for (int i = 0; i < rows_per_cuda_block; ++i) {
+                tmp[j][i] += vec_dot_q_cuda(
+                    &x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs);
+            }
         }
     }
 
-    __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y_template != 0 ? ncols_y_template : 4][WARP_SIZE];
+    __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
     if (threadIdx.y > 0) {
 #pragma unroll
         for (int j = 0; j < ncols_y; ++j) {
-            tmp_shared[threadIdx.y-1][j][threadIdx.x] = tmp[j];
+#pragma unroll
+            for (int i = 0; i < rows_per_cuda_block; ++i) {
+                tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
+            }
         }
     }
     __syncthreads();
|
|
5366
5662
|
#pragma unroll
|
5367
5663
|
for (int j = 0; j < ncols_y; ++j) {
|
5368
5664
|
#pragma unroll
|
5369
|
-
for (int i = 0; i <
|
5370
|
-
|
5665
|
+
for (int i = 0; i < rows_per_cuda_block; ++i) {
|
5666
|
+
#pragma unroll
|
5667
|
+
for (int l = 0; l < nwarps-1; ++l) {
|
5668
|
+
tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
|
5669
|
+
}
|
5670
|
+
tmp[j][i] = warp_reduce_sum(tmp[j][i]);
|
5371
5671
|
}
|
5372
|
-
tmp[j] = warp_reduce_sum(tmp[j]);
|
5373
5672
|
|
5374
|
-
if (threadIdx.x
|
5375
|
-
dst[j*nrows_dst +
|
5673
|
+
if (threadIdx.x < rows_per_cuda_block) {
|
5674
|
+
dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
|
5376
5675
|
}
|
5377
5676
|
}
|
5378
5677
|
}
|
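The rewritten kernel now processes up to two matrix rows per CUDA block and derives its warp count from the batch size at compile time. A small illustrative C sketch of the non-RDNA2/3 mapping, mirroring the `constexpr` choices and grid-size formula in the hunk above (the helper name is hypothetical):

```c
// launch-geometry choice for the rewritten mul_mat_vec_q (NVIDIA / pre-RDNA2 path)
typedef struct { int nwarps, rows_per_cuda_block; } mmvq_shape;

static mmvq_shape mmvq_pick_shape(int ncols_y) {     // ncols_y in 1..MMVQ_MAX_BATCH_SIZE (8)
    mmvq_shape s;
    s.nwarps              = ncols_y <= 4 ? 4 : 2;    // more warps for small batches
    s.rows_per_cuda_block = ncols_y == 1 ? 1 : 2;    // pair up rows once there is a batch
    return s;
}

// grid size then follows the diff:
// nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block
```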
@@ -5945,149 +6244,31 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
     dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
 }
 
-template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
-static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-    const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
-    const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;
+template <bool vals_smem, int ncols_template, int block_size_template>
+static __global__ void soft_max_f32(const float * x, const float * mask, const float * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
+    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
 
     const int tid  = threadIdx.x;
     const int rowx = blockIdx.x;
-    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+    const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension
 
     const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
 
     const int warp_id = threadIdx.x / WARP_SIZE;
     const int lane_id = threadIdx.x % WARP_SIZE;
 
-    extern __shared__ half data_soft_max_f16[];
-    half * buf_iw = data_soft_max_f16 + 0; // shared memory buffer for inter-warp communication
-    // (shared memory) buffer to cache values between iterations:
-    half2 * vals = vals_smem ? (half2 *) (buf_iw + WARP_SIZE) : (half2 *) (dst + rowx*ncols_data);
-    // if the buffer is larger than max. shared memory per block, use dst as temp. buffer instead
-    // in that case col_smem == col_data must be enforced to avoid race conditions
+    float slope = 0.0f;
 
-    half2 max_val = make_half2(-INFINITY, -INFINITY);
+    // ALiBi
+    if (max_bias > 0.0f) {
+        const int h = rowx/nrows_y; // head index
 
-#pragma unroll
-    for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
-        const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
-        const int col_smem = vals_smem ? col0 + tid : col_data;
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
 
-        const int ix = rowx*ncols_data + col_data;
-        const int iy = rowy*ncols_data + col_data;
-
-        half2 val;
-        if (need_check && col_data + 0 >= ncols_data) {
-            val.x = -INFINITY;
-        } else {
-            val.x = x[ix + 0]*scale + (y ? y[iy + 0] : 0.0f);
-        }
-        if (need_check && col_data + WARP_SIZE >= ncols_data) {
-            val.y = -INFINITY;
-        } else {
-            val.y = x[ix + WARP_SIZE]*scale + (y ? y[iy + WARP_SIZE] : 0.0f);
-        }
-        if (!need_check || col_smem < (vals_smem ? ncols_smem : ncols_data)) {
-            vals[col_smem] = val;
-        }
-        max_val = __hmax2(max_val, val);
+        slope = powf(base, exp);
     }
 
-    // find the max value in the block
-    max_val = warp_reduce_max(max_val);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf_iw[lane_id] = -INFINITY;
-        }
-        __syncthreads();
-
-        if (lane_id == 0) {
-            buf_iw[warp_id] = __hmax(max_val.x, max_val.y);
-        }
-        __syncthreads();
-
-        max_val = __half2half2(buf_iw[lane_id]);
-        max_val = warp_reduce_max(max_val);
-    } else {
-        max_val = __half2half2(__hmax(max_val.x, max_val.y));
-    }
-
-    half2 tmp = make_half2(0.0f, 0.0f); // partial sums
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
-        const int col_smem = vals_smem ? col0 + tid : 2*col0 + 2*warp_id*WARP_SIZE + lane_id;
-
-        if (ncols_template == 0 && col_smem >= (vals_smem ? ncols_smem : ncols_data)) {
-            break;
-        }
-
-        const half2 val = h2exp(vals[col_smem] - max_val);
-
-        tmp += val;
-        vals[col_smem] = val;
-    }
-
-    // find the sum of exps in the block
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf_iw[lane_id] = 0.0f;
-        }
-        __syncthreads();
-
-        if (lane_id == 0) {
-            buf_iw[warp_id] = tmp.x + tmp.y;
-        }
-        __syncthreads();
-
-        tmp = __half2half2(buf_iw[lane_id]);
-        tmp = warp_reduce_sum(tmp);
-    } else {
-        tmp = __half2half2(tmp.x + tmp.y);
-    }
-
-    const half2 inv_sum = make_half2(1.0f, 1.0f) / tmp;
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
-        const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
-        const int col_smem = vals_smem ? col0 + tid : col_data;
-
-        const int idst = rowx*ncols_data + col_data;
-        const half2 result = vals[col_smem] * inv_sum;
-
-        if (need_check && col_data + 0 >= ncols_data) {
-            return;
-        }
-        dst[idst] = result.x;
-
-        if (need_check && col_data + WARP_SIZE >= ncols_data) {
-            return;
-        }
-
-        dst[idst + WARP_SIZE] = result.y;
-    }
-#else
-    (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-}
-
-template <bool vals_smem, int ncols_template, int block_size_template>
-static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
-    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
-
-    const int tid  = threadIdx.x;
-    const int rowx = blockIdx.x;
-    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
-
-    const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
-
-    const int warp_id = threadIdx.x / WARP_SIZE;
-    const int lane_id = threadIdx.x % WARP_SIZE;
-
     extern __shared__ float data_soft_max_f32[];
     float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
     // shared memory buffer to cache values between iterations:
@@ -6106,7 +6287,8 @@ static __global__ void soft_max_f32(const float * x, const float * y, float * ds
         const int ix = rowx*ncols + col;
         const int iy = rowy*ncols + col;
 
-        const float val = x[ix]*scale + (y ? y[iy] : 0.0f);
+        const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
+
         vals[col] = val;
         max_val = max(max_val, val);
     }
@@ -6667,6 +6849,18 @@ static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k,
     dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
 }
 
+template<typename dst_t>
+static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
+}
+
 template <typename src_t, typename dst_t>
 static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
|
6706
6900
|
return dequantize_row_iq2_xs_cuda;
|
6707
6901
|
case GGML_TYPE_IQ3_XXS:
|
6708
6902
|
return dequantize_row_iq3_xxs_cuda;
|
6903
|
+
case GGML_TYPE_IQ1_S:
|
6904
|
+
return dequantize_row_iq1_s_cuda;
|
6905
|
+
case GGML_TYPE_IQ4_NL:
|
6906
|
+
return dequantize_row_iq4_nl_cuda;
|
6709
6907
|
case GGML_TYPE_F32:
|
6710
6908
|
return convert_unary_cuda<float>;
|
6711
6909
|
default:
|
@@ -6741,6 +6939,10 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_iq2_xs_cuda;
         case GGML_TYPE_IQ3_XXS:
             return dequantize_row_iq3_xxs_cuda;
+        case GGML_TYPE_IQ1_S:
+            return dequantize_row_iq1_s_cuda;
+        case GGML_TYPE_IQ4_NL:
+            return dequantize_row_iq4_nl_cuda;
         case GGML_TYPE_F16:
             return convert_unary_cuda<half>;
         default:
@@ -6851,65 +7053,75 @@ static void mul_mat_vec_q_cuda(
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
 
     GGML_ASSERT(ncols_x % qk == 0);
-    GGML_ASSERT(ncols_y <= 4);
+    GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
 
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
 
-    int nwarps;
-    if (g_device_caps[id].cc >= CC_OFFSET_AMD) {
-        nwarps = g_device_caps[id].cc >= CC_RDNA2 ? MMVQ_NWARPS_AMD_RDNA2 : MMVQ_NWARPS_AMD_OLD;
-    } else {
-        nwarps = MMVQ_NWARPS_NVIDIA;
-    }
+    int64_t nwarps = 1;
+    int64_t rows_per_cuda_block = 1;
 
-    const dim3 block_nums(nrows_x, 1, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    switch (nwarps) {
-        case 1: switch(ncols_y) {
+    if (g_device_caps[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
+        switch(ncols_y) {
             case 1:
-                mul_mat_vec_q<1, 1, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
+                nwarps = 4;
+                rows_per_cuda_block = 1;
                 break;
             case 2:
-                mul_mat_vec_q<1, 2, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
-                break;
             case 3:
-                mul_mat_vec_q<1, 3, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
-                break;
             case 4:
-                mul_mat_vec_q<1, 4, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
+                nwarps = 4;
+                rows_per_cuda_block = 2;
                 break;
-            default:
-                GGML_ASSERT(false);
-                break;
-        } break;
-        case 4: switch(ncols_y) {
-            case 1:
-                mul_mat_vec_q<4, 1, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
-                break;
-            case 2:
-                mul_mat_vec_q<4, 2, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
-                break;
-            case 3:
-                mul_mat_vec_q<4, 3, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
-                break;
-            case 4:
-                mul_mat_vec_q<4, 4, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
+            case 5:
+            case 6:
+            case 7:
+            case 8:
+                nwarps = 2;
+                rows_per_cuda_block = 2;
                 break;
             default:
                 GGML_ASSERT(false);
                 break;
-        }
+        }
+    }
+    const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
+    const dim3 block_nums(nblocks, 1, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
+    switch (ncols_y) {
+        case 1:
+            mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 2:
+            mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 3:
+            mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 4:
+            mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 5:
+            mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 6:
+            mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 7:
+            mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 8:
+            mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
         default:
             GGML_ASSERT(false);
             break;
@@ -7568,89 +7780,53 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
     diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 }
 
-static void soft_max_f16_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
-    int nth = WARP_SIZE;
-    while (nth < ncols_x/2 && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
-    const dim3 block_dims(nth,     1, 1);
-    const dim3 block_nums(nrows_x, 1, 1);
-    const size_t shmem = (GGML_PAD(ncols_x, 2*WARP_SIZE) + WARP_SIZE)*sizeof(half);
-    static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
-    if (shmem <= g_device_caps[g_main_device].smpb) {
-        switch (ncols_x) {
-            case 32:
-                soft_max_f16<true, 32, 32, true><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 64:
-                soft_max_f16<true, 64, 32, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 128:
-                soft_max_f16<true, 128, 64, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 256:
-                soft_max_f16<true, 256, 128, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 512:
-                soft_max_f16<true, 512, 256, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 1024:
-                soft_max_f16<true, 1024, 512, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 2048:
-                soft_max_f16<true, 2048, 1024, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 4096:
-                soft_max_f16<true, 4096, 1024, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            default:
-                soft_max_f16<true, 0, 0, true><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-        }
-    } else {
-        const size_t shmem_low = WARP_SIZE*sizeof(half);
-        soft_max_f16<false, 0, 0, true><<<block_nums, block_dims, shmem_low, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-    }
-}
-
-static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
+static void soft_max_f32_cuda(const float * x, const float * mask, const float * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
     int nth = WARP_SIZE;
     while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
     const dim3 block_dims(nth,     1, 1);
     const dim3 block_nums(nrows_x, 1, 1);
     const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
     static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
+
+    const uint32_t n_head_kv   = nrows_x/nrows_y;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
     if (shmem < g_device_caps[g_main_device].smpb) {
         switch (ncols_x) {
             case 32:
-                soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
             case 64:
-                soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
             case 128:
-                soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
             case 256:
-                soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
             case 512:
-                soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
             case 1024:
-                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
             case 2048:
-                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
             case 4096:
-                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
             default:
-                soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
         }
     } else {
         const size_t shmem_low = WARP_SIZE*sizeof(float);
-        soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+        soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
     }
 }
 
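The `m0`/`m1`/`n_head_log2` values above implement the standard ALiBi slope schedule. An illustrative host-side C sketch of the per-head slope, assuming `nrows_x/nrows_y` is the head count as in the launcher above (`alibi_slope` is a hypothetical helper, not code from the diff):

```c
#include <math.h>

// ALiBi slope per head, as computed in soft_max_f32_cuda above:
// heads below n_head_log2 use base m0 with exponent h+1, the rest use
// base m1 with odd exponents 2*(h - n_head_log2) + 1
static float alibi_slope(int h, int n_head, float max_bias) {
    const unsigned n_head_log2 = 1u << (unsigned) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    return h < (int) n_head_log2 ? powf(m0, h + 1)
                                 : powf(m1, 2*(h - (int) n_head_log2) + 1);
}
```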
@@ -7922,6 +8098,7 @@ GGML_CALL void ggml_init_cublas() {
     if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
         initialized = true;
         g_cublas_loaded = false;
+        fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
         return;
     }
 
@@ -8509,6 +8686,8 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ4_NL:
             return max_compute_capability >= CC_RDNA2 ? 128 : 64;
         default:
             GGML_ASSERT(false);
@@ -8532,6 +8711,8 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ4_NL:
             return max_compute_capability >= CC_VOLTA ? 128 : 64;
         case GGML_TYPE_Q6_K:
             return 64;
@@ -8629,6 +8810,14 @@ static void ggml_cuda_op_mul_mat_vec_q(
             mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
                 (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
+        case GGML_TYPE_IQ1_S:
+            mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_IQ4_NL:
+            mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
         default:
             GGML_ASSERT(false);
             break;
@@ -9068,30 +9257,36 @@ static void ggml_cuda_op_soft_max(
 
     GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
 
-    const int64_t ne00 = src0->ne[0];
+    const int64_t ne00    = src0->ne[0];
     const int64_t nrows_x = ggml_nrows(src0);
-    const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
+    const int64_t nrows_y = src0->ne[1];
 
-    float scale = 1.0f;
-    memcpy(&scale, dst->op_params, sizeof(float));
+    float scale    = 1.0f;
+    float max_bias = 0.0f;
 
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
-#ifdef GGML_CUDA_F16
-    const bool use_f16_soft_max = true;
-#else
-    const bool use_f16_soft_max = false;
-#endif // GGML_CUDA_F16
-#else
-    const bool use_f16_soft_max = false;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX
+    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
 
-    if (use_f16_soft_max) {
-        soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
-    } else {
-        soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
+    // positions tensor
+    float * src2_dd = nullptr;
+    cuda_pool_alloc<float> src2_f;
+
+    ggml_tensor * src2 = dst->src[2];
+    const bool use_src2 = src2 != nullptr;
+
+    if (use_src2) {
+        const bool src2_on_device = src2->backend == GGML_BACKEND_GPU;
+
+        if (src2_on_device) {
+            ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
+            src2_dd = (float *) src2_extra->data_device[g_main_device];
+        } else {
+            src2_dd = src2_f.alloc(ggml_nelements(src2));
+            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
+        }
     }
 
-    (void) src1_dd;
+    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream);
 }
 
 static void ggml_cuda_op_scale(
static void ggml_cuda_op_scale(
|
@@ -9226,9 +9421,15 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
|
|
9226
9421
|
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
|
9227
9422
|
if (can_access_peer) {
|
9228
9423
|
if (enable_peer_access) {
|
9229
|
-
|
9424
|
+
cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
|
9425
|
+
if (err != cudaErrorPeerAccessAlreadyEnabled) {
|
9426
|
+
CUDA_CHECK(err);
|
9427
|
+
}
|
9230
9428
|
} else {
|
9231
|
-
|
9429
|
+
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
|
9430
|
+
if (err != cudaErrorPeerAccessNotEnabled) {
|
9431
|
+
CUDA_CHECK(err);
|
9432
|
+
}
|
9232
9433
|
}
|
9233
9434
|
}
|
9234
9435
|
}
|
@@ -9735,7 +9936,7 @@ static __global__ void k_compute_batched_ptrs(
     ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
 }
 
-static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
@@ -9893,39 +10094,69 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
     int64_t min_compute_capability = INT_MAX;
 
+    bool any_pascal_with_slow_fp16 = false;
     if (split) {
         ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
         auto & tensor_split = buft_ctx->tensor_split;
         for (int id = 0; id < g_device_count; ++id) {
-            if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
+            // skip devices that are not going to do any work:
+            if (tensor_split[id] >= (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
+                continue;
+            }
+
+            if (min_compute_capability > g_device_caps[id].cc) {
                 min_compute_capability = g_device_caps[id].cc;
             }
+            if (g_device_caps[id].cc == 610) {
+                any_pascal_with_slow_fp16 = true;
+            }
         }
     } else {
-        min_compute_capability = g_device_caps[g_main_device].cc;
+        min_compute_capability    = g_device_caps[g_main_device].cc;
+        any_pascal_with_slow_fp16 = g_device_caps[g_main_device].cc == 610;
     }
 
+    // check data types and tensor shapes for custom matrix multiplication kernels:
+    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
+
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+
+    bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 
     const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
-    bool       use_mul_mat_q         = ggml_is_quantized(src0->type);
+
 #ifdef CUDA_USE_TENSOR_CORES
     use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
 #endif // CUDA_USE_TENSOR_CORES
 
 #else
 
-    const bool fp16_performance_good = min_compute_capability >= CC_VOLTA;
-    bool       use_mul_mat_q         = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+    // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
+    const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
+
+    // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
+    use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
+    use_mul_mat_q     = use_mul_mat_q     && min_compute_capability >= MIN_CC_DP4A;
+
 #ifdef CUDA_USE_TENSOR_CORES
     // when tensor cores are available, use them for large batch size
     // ref: https://github.com/ggerganov/llama.cpp/pull/3776
-    use_mul_mat_q = use_mul_mat_q && src1->ne[1] <= MMQ_MAX_BATCH_SIZE;
+    use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
 #endif // CUDA_USE_TENSOR_CORES
 
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 
-    use_mul_mat_q = use_mul_mat_q && ggml_cuda_supports_mmq(src0->type);
+    // if mmvq is available it's a better choice than dmmv:
+#ifndef GGML_CUDA_FORCE_DMMV
+    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+#endif // GGML_CUDA_FORCE_DMMV
 
     // debug helpers
     //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
@@ -9943,33 +10174,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
     } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // KQ + KQV multi-batch
-        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
-    } else if (src0->type == GGML_TYPE_F32) {
-        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
-    } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-#ifdef GGML_CUDA_FORCE_DMMV
-            const bool use_mul_mat_vec_q = false;
-#else
-            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
-#endif // GGML_CUDA_FORCE_DMMV
-
-            if (use_mul_mat_vec_q) {
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
-            } else {
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
-            }
-        } else {
-            if (src1->ne[1] <= 4 && min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32) {
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
-            } else if (use_mul_mat_q) {
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
-            } else {
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
-            }
-        }
+        ggml_cuda_mul_mat_batched_cublas(src0, src1, dst);
+    } else if (use_dequantize_mul_mat_vec) {
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+    } else if (use_mul_mat_vec_q) {
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+    } else if (use_mul_mat_q) {
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
     } else {
-        GGML_ASSERT(false);
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     }
 }
 
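
The dispatch above is now a flat chain over the predicates hoisted in the previous hunk. A condensed, self-contained restatement of that selection order; the enum and helper are illustrative, the constants mirror MMVQ_MAX_BATCH_SIZE = 8, MMQ_MAX_BATCH_SIZE = 32 and MIN_CC_DP4A = 610 from this file, and GGML_CUDA_DMMV_X = 32 is an assumption about the default build:

enum class mm_kernel { dmmv, mmvq, mmq, cublas };

static mm_kernel choose_mm_kernel(bool quantized, bool fp16_src0, int ne00, int batch,
                                  int min_cc, bool fp16_fast, bool mmq_supported) {
    const int MMVQ_MAX_BATCH = 8;   // MMVQ_MAX_BATCH_SIZE
    const int MMQ_MAX_BATCH  = 32;  // MMQ_MAX_BATCH_SIZE (tensor-core builds)
    const int MIN_DP4A       = 610; // __dp4a needs CC >= 6.1 on NVIDIA
    bool use_dmmv = (quantized || fp16_src0) && ne00 % 32 == 0 && batch == 1;
    bool use_mmvq = quantized && batch <= MMVQ_MAX_BATCH && min_cc >= MIN_DP4A;
    bool use_mmq  = mmq_supported && min_cc >= MIN_DP4A
                    && (!fp16_fast || batch <= MMQ_MAX_BATCH);
    if (use_mmvq) { use_dmmv = false; } // mmvq is preferred over dmmv when legal
    if (use_dmmv) return mm_kernel::dmmv;
    if (use_mmvq) return mm_kernel::mmvq;
    if (use_mmq)  return mm_kernel::mmq;
    return mm_kernel::cublas;            // everything else falls back to cuBLAS
}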
@@ -10888,10 +11101,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
     UNUSED(buffer);
 }
 
-// unused at the moment
-//static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
-//    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
-//}
+static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
+    UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
+}
 
 GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
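
ggml_backend_buffer_is_cuda_split above identifies a split buffer by comparing the iface.get_name function pointer, not by string comparison. A reduced sketch of that identification pattern with stand-in types (buffer_iface and buffer are illustrative, not ggml's real structs):

struct buffer_iface { const char * (*get_name)(void * buf); };
struct buffer       { buffer_iface iface; };

static const char * split_get_name(void *) { return "CUDA_Split"; }

static bool buffer_is_split(const buffer * buf) {
    // identity of the implementation, not equality of the returned string:
    return buf->iface.get_name == split_get_name;
}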
@@ -11279,7 +11492,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
                 assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
-                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
                 assert(node->src[j]->extra != nullptr);
             }
         }
@@ -11327,7 +11540,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
                 return false;
             }
             ggml_type a_type = a->type;
-            if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS) {
+            if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
+                a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ4_NL) {
                 if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
                     return false;
                 }
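
The supports_op hunk above extends the existing shape guard to the new IQ1_S and IQ4_NL types: when src1 is a single column repeated over multiple rows (b->ne[1] == 1 with ggml_nrows(b) > 1), the op is declared unsupported so it falls back to the CPU. A reduced sketch of that guard with illustrative enum values and helper name:

#include <cstdint>

enum ggml_type_sketch { IQ2_XXS, IQ2_XS, IQ3_XXS, IQ1_S, IQ4_NL, OTHER };

static bool cuda_supports_mul_mat(ggml_type_sketch a_type, int64_t b_ne1, int64_t b_nrows) {
    const bool iquant = a_type == IQ2_XXS || a_type == IQ2_XS || a_type == IQ3_XXS ||
                        a_type == IQ1_S   || a_type == IQ4_NL;
    if (iquant && b_ne1 == 1 && b_nrows > 1) {
        return false; // this batched single-column case is not covered by the CUDA kernels
    }
    return true;
}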