llama_cpp 0.12.6 → 0.12.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +21 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +8 -1
- data/vendor/tmp/llama.cpp/Makefile +43 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -9
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +908 -54
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +81 -203
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +124 -52
- data/vendor/tmp/llama.cpp/ggml.c +948 -504
- data/vendor/tmp/llama.cpp/ggml.h +24 -11
- data/vendor/tmp/llama.cpp/llama.cpp +688 -163
- data/vendor/tmp/llama.cpp/llama.h +37 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-cuda.cu

@@ -1,3 +1,7 @@
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+
 #include <algorithm>
 #include <assert.h>
 #include <atomic>
@@ -54,6 +58,8 @@
 #define cudaDeviceProp hipDeviceProp_t
 #define cudaDeviceSynchronize hipDeviceSynchronize
 #define cudaError_t hipError_t
+#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
+#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
 #define cudaEventCreateWithFlags hipEventCreateWithFlags
 #define cudaEventDisableTiming hipEventDisableTiming
 #define cudaEventRecord hipEventRecord
@@ -119,11 +125,6 @@
 
 #endif // defined(GGML_USE_HIPBLAS)
 
-// ggml-cuda need half type so keep ggml headers include at last
-#include "ggml-cuda.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
-
 #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 
 #define CC_PASCAL 600
@@ -150,8 +151,8 @@
 #define CUDA_USE_TENSOR_CORES
 #endif
 
-// max batch size to use MMQ kernels when tensor cores are available
-#define MMQ_MAX_BATCH_SIZE 32
+#define MMVQ_MAX_BATCH_SIZE  8 // max batch size to use MMVQ kernels
+#define MMQ_MAX_BATCH_SIZE  32 // max batch size to use MMQ kernels when tensor cores are available
 
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
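A condensed view of how these two limits are used (a rough sketch, not code from this diff): for quantized src0 the batch size src1->ne[1] decides which custom kernel is considered, and anything above both limits is left to the cuBLAS/dequantize paths. The dtype and compute-capability checks that the real dispatch in ggml_cuda_mul_mat also applies are omitted, and the helper name is hypothetical.

    static const char * pick_quantized_mm_kernel(int64_t ne11 /* src1->ne[1] */, bool tensor_cores_usable) {
        if (ne11 <= MMVQ_MAX_BATCH_SIZE) return "mul_mat_vec_q";                   // small batches: MMVQ
        if (!tensor_cores_usable || ne11 <= MMQ_MAX_BATCH_SIZE) return "mul_mat_q"; // medium batches: MMQ
        return "cublas";                                                            // large batches: cuBLAS
    }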
@@ -517,6 +518,24 @@ typedef struct {
 } block_iq3_xxs;
 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
 
+#define QR1_S 8
+#define QI1_S (QK_K / (4*QR1_S))
+typedef struct {
+    half d;
+    uint8_t qs[QK_K/8];
+    uint8_t scales[QK_K/16];
+} block_iq1_s;
+static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
+
+#define QK4_NL 32
+#define QR4_NL 2
+#define QI4_NL (QK4_NL / (4*QR4_NL))
+typedef struct {
+    half d;
+    uint8_t qs[QK4_NL/2];
+} block_iq4_nl;
+static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
+
 #define WARP_SIZE 32
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
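For orientation, the sizes asserted above work out as follows (assuming QK_K == 256 and a 2-byte half, as elsewhere in ggml); the checks below are a back-of-the-envelope restatement, not code from the diff:

    static_assert(2 + 256/8 + 256/16 == 50, "block_iq1_s: 50 bytes per 256 weights, ~1.56 bits/weight");
    static_assert(2 + 32/2           == 18, "block_iq4_nl: 18 bytes per 32 weights, 4.5 bits/weight");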
@@ -642,18 +661,18 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
     return a;
 }
 
-static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-#pragma unroll
-   for (int mask = 16; mask > 0; mask >>= 1) {
-       a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
-   }
-   return a;
-#else
-   (void) a;
-   NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-}
+//static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+//#pragma unroll
+//   for (int mask = 16; mask > 0; mask >>= 1) {
+//       a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+//   }
+//   return a;
+//#else
+//   (void) a;
+//   NO_DEVICE_CODE;
+//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+//}
 
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
@@ -663,18 +682,18 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
     return x;
 }
 
-static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-#pragma unroll
-   for (int mask = 16; mask > 0; mask >>= 1) {
-       x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-   }
-   return x;
-#else
-   (void) x;
-   NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-}
+//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
+//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
+//#pragma unroll
+//   for (int mask = 16; mask > 0; mask >>= 1) {
+//       x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+//   }
+//   return x;
+//#else
+//   (void) x;
+//   NO_DEVICE_CODE;
+//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
+//}
 
 static __device__ __forceinline__ float op_repeat(const float a, const float b) {
     return b;
@@ -1681,6 +1700,137 @@ static const __device__ uint32_t iq3xxs_grid[256] = {
     0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
 };
 
+static const __device__ uint64_t iq1s_grid[512] = {
+    0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
+    0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
+    0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
+    [... 122 further lines of packed uint64_t grid entries, 512 values in total; elided here for brevity ...]
+    0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
+    0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
+};
+
 static const __device__ uint8_t ksigns_iq2xs[128] = {
     0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
     144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
@@ -1823,6 +1973,49 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
 
 }
 
+template<typename dst_t>
+static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int i   = blockIdx.x;
+    const block_iq1_s * x = (const block_iq1_s *) vx;
+
+    const int tid = threadIdx.x;
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const int i8 = 4*ib+il;
+    uint8_t h = x[i].scales[i8/2] >> 4*(i8%2);
+    const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5)));
+    const float d = (float)x[i].d * (2*(h & 7) + 1);
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j];
+#else
+    assert(false);
+#endif
+
+}
+
+static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int i   = blockIdx.x;
+    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
+
+    const int tid = threadIdx.x;
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+    const uint8_t * q4 = x[ib].qs + 4*il;
+    const float d = (float)x[ib].d;
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
+    }
+
+}
+
 static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
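The IQ4_NL kernel above maps each 4-bit index through the non-linear kvalues_iq4nl table and scales by the block's fp16 d. A minimal CPU reference for a single block, assuming the same struct and table as in the diff (hypothetical helper, not part of the change):

    static void dequantize_iq4_nl_ref(const block_iq4_nl * b, float * out /* 32 floats */) {
        const float d = (float) b->d;                         // per-block fp16 scale
        for (int j = 0; j < 16; ++j) {
            out[j +  0] = d * kvalues_iq4nl[b->qs[j] & 0xf];  // low nibbles fill the first half
            out[j + 16] = d * kvalues_iq4nl[b->qs[j] >>  4];  // high nibbles fill the second half
        }
    }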
@@ -4478,10 +4671,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
     const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
     return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
 #else
+    (void) ksigns64;
     assert(false);
     return 0.f;
 #endif
 #else
+    (void) ksigns64;
     assert(false);
     return 0.f;
 #endif
@@ -4522,6 +4717,99 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
 #endif
 }
 
+static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+#if QK_K == 256
+    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
+
+    const int ib32 = iqs;
+    int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
+    const uint8_t h1 = bq1->scales[2*ib32+0];
+    const uint8_t h2 = bq1->scales[2*ib32+1];
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const int * q8 = (const int *)bq8_1[ib32].qs;
+    const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
+    const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
+    const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
+    const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
+    for (int j = 0; j < 2; ++j) {
+        sumi1 = __dp4a(q8[j+0], grid1[j], sumi1);
+        sumi2 = __dp4a(q8[j+2], grid2[j], sumi2);
+        sumi3 = __dp4a(q8[j+4], grid3[j], sumi3);
+        sumi4 = __dp4a(q8[j+6], grid4[j], sumi4);
+    }
+#else
+    const int8_t * q8 = bq8_1[ib32].qs;
+    const int8_t * grid1 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
+    const int8_t * grid2 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
+    const int8_t * grid3 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
+    const int8_t * grid4 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
+    for (int j = 0; j < 8; ++j) {
+        sumi1 += q8[j+ 0] * grid1[j];
+        sumi2 += q8[j+ 8] * grid2[j];
+        sumi3 += q8[j+16] * grid3[j];
+        sumi4 += q8[j+24] * grid4[j];
+    }
+#endif
+    const float d = (float)bq1->d * __low2float(bq8_1[ib32].ds);
+    return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) +
+                sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1));
+#else
+    assert(false);
+    return 0.f;
+#endif
+}
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
+        int & val1, int & val2) {
+
+    uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
+    aux32 = q4 & 0x0f0f0f0f;
+    uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
+    uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
+    val1 = v1 | (v2 << 16);
+    aux32 = (q4 >> 4) & 0x0f0f0f0f;
+    v1 = values[q8[0]] | (values[q8[1]] << 8);
+    v2 = values[q8[2]] | (values[q8[3]] << 8);
+    val2 = v1 | (v2 << 16);
+}
+#endif
+
+static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
+    const int32_t  * q8 = (const int32_t  *)bq8_1->qs + iqs;
+
+    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
+
+    int v1, v2;
+    int sumi1 = 0, sumi2 = 0;
+    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
+        const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
+        get_int_from_table_16(aux, values, v1, v2);
+        sumi1 = __dp4a(v1, q8[l+0], sumi1);
+        sumi2 = __dp4a(v2, q8[l+4], sumi2);
+    }
+
+#else
+    const uint8_t * q4 = bq->qs + 4*iqs;
+    const int8_t  * q8 = bq8_1->qs + 4*iqs;
+
+    int sumi1 = 0, sumi2 = 0;
+    for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) {
+        sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
+        sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >>  4];
+    }
+#endif
+    const float d = (float)bq->d * __low2float(bq8_1->ds);
+    return d * (sumi1 + sumi2);
+}
+
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
           allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
 static __device__ __forceinline__ void mul_mat_q(
@@ -5310,51 +5598,59 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
-
-#define MMVQ_NWARPS_AMD_RDNA2 1
-#define MMVQ_NWARPS_AMD_OLD 4
-
-template <int nwarps, int ncols_y_template, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
+template <int ncols_y, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-
+// tell the compiler to use as many registers as it wants, see nwarps definition below
+__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void mul_mat_vec_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int nrows_y, const int
-
-    const int ncols_y = ncols_y_template != 0 ? ncols_y_template : ncols_y_par;
+    const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
 
-
-
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+    constexpr int nwarps              = 1;
+    constexpr int rows_per_cuda_block = 1;
+#else
+    constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
+    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
 
-    const
-    const
-    const
+    const     int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
+    const     int row0 = rows_per_cuda_block*blockIdx.x;
+    const     int blocks_per_row_x = ncols_x / qk;
+    const     int blocks_per_col_y = nrows_y / QK8_1;
+    constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
 
     // partial sum for each thread
-    float tmp[
+    float tmp[ncols_y][rows_per_cuda_block] = {0.0f};
 
     const block_q_t  * x = (const block_q_t  *) vx;
     const block_q8_1 * y = (const block_q8_1 *) vy;
 
-    for (int
-        const int
-
-        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+    for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
+        const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx
 
-
+        // x block quant index when casting the quants to int
+        const int kqs = vdr * (tid % (qi/vdr));
 
 #pragma unroll
         for (int j = 0; j < ncols_y; ++j) {
-
+#pragma unroll
+            for (int i = 0; i < rows_per_cuda_block; ++i) {
+                tmp[j][i] += vec_dot_q_cuda(
+                    &x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs);
+            }
         }
     }
 
-    __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][
+    __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE];
     if (threadIdx.y > 0) {
 #pragma unroll
         for (int j = 0; j < ncols_y; ++j) {
-
+#pragma unroll
+            for (int i = 0; i < rows_per_cuda_block; ++i) {
+                tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
+            }
         }
     }
     __syncthreads();
@@ -5366,13 +5662,16 @@ static __global__ void mul_mat_vec_q(
 #pragma unroll
     for (int j = 0; j < ncols_y; ++j) {
 #pragma unroll
-        for (int i = 0; i <
-
+        for (int i = 0; i < rows_per_cuda_block; ++i) {
+#pragma unroll
+            for (int l = 0; l < nwarps-1; ++l) {
+                tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
+            }
+            tmp[j][i] = warp_reduce_sum(tmp[j][i]);
         }
-        tmp[j] = warp_reduce_sum(tmp[j]);
 
-        if (threadIdx.x
-            dst[j*nrows_dst +
+        if (threadIdx.x < rows_per_cuda_block) {
+            dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
         }
     }
 }
@@ -5945,149 +6244,31 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
     dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
 }
 
-template <bool vals_smem, int ncols_template, int block_size_template
-static __global__ void
-
-    const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
-    const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;
+template <bool vals_smem, int ncols_template, int block_size_template>
+static __global__ void soft_max_f32(const float * x, const float * mask, const float * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
+    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
 
     const int tid  = threadIdx.x;
     const int rowx = blockIdx.x;
-    const int rowy = rowx % nrows_y; // broadcast the mask
+    const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension
 
     const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
 
     const int warp_id = threadIdx.x / WARP_SIZE;
     const int lane_id = threadIdx.x % WARP_SIZE;
 
-
-    half  * buf_iw = data_soft_max_f16 + 0; // shared memory buffer for inter-warp communication
-    // (shared memory) buffer to cache values between iterations:
-    half2 * vals = vals_smem ? (half2 *) (buf_iw + WARP_SIZE) : (half2 *) (dst + rowx*ncols_data);
-    // if the buffer is larger than max. shared memory per block, use dst as temp. buffer instead
-    // in that case col_smem == col_data must be enforced to avoid race conditions
+    float slope = 0.0f;
 
-
+    // ALiBi
+    if (max_bias > 0.0f) {
+        const int h = rowx/nrows_y; // head index
 
-
-
-        const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
-        const int col_smem = vals_smem ? col0 + tid : col_data;
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
 
-
-        const int iy = rowy*ncols_data + col_data;
-
-        half2 val;
-        if (need_check && col_data + 0 >= ncols_data) {
-            val.x = -INFINITY;
-        } else {
-            val.x = x[ix + 0]*scale + (y ? y[iy + 0] : 0.0f);
-        }
-        if (need_check && col_data + WARP_SIZE >= ncols_data) {
-            val.y = -INFINITY;
-        } else {
-            val.y = x[ix + WARP_SIZE]*scale + (y ? y[iy + WARP_SIZE] : 0.0f);
-        }
-        if (!need_check || col_smem < (vals_smem ? ncols_smem : ncols_data)) {
-            vals[col_smem] = val;
-        }
-        max_val = __hmax2(max_val, val);
+        slope = powf(base, exp);
     }
 
-    // find the max value in the block
-    max_val = warp_reduce_max(max_val);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf_iw[lane_id] = -INFINITY;
-        }
-        __syncthreads();
-
-        if (lane_id == 0) {
-            buf_iw[warp_id] = __hmax(max_val.x, max_val.y);
-        }
-        __syncthreads();
-
-        max_val = __half2half2(buf_iw[lane_id]);
-        max_val = warp_reduce_max(max_val);
-    } else {
-        max_val = __half2half2(__hmax(max_val.x, max_val.y));
-    }
-
-    half2 tmp = make_half2(0.0f, 0.0f); // partial sums
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
-        const int col_smem = vals_smem ? col0 + tid : 2*col0 + 2*warp_id*WARP_SIZE + lane_id;
-
-        if (ncols_template == 0 && col_smem >= (vals_smem ? ncols_smem : ncols_data)) {
-            break;
-        }
-
-        const half2 val = h2exp(vals[col_smem] - max_val);
-
-        tmp += val;
-        vals[col_smem] = val;
-    }
-
-    // find the sum of exps in the block
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf_iw[lane_id] = 0.0f;
-        }
-        __syncthreads();
-
-        if (lane_id == 0) {
-            buf_iw[warp_id] = tmp.x + tmp.y;
-        }
-        __syncthreads();
-
-        tmp = __half2half2(buf_iw[lane_id]);
-        tmp = warp_reduce_sum(tmp);
-    } else {
-        tmp = __half2half2(tmp.x + tmp.y);
-    }
-
-    const half2 inv_sum = make_half2(1.0f, 1.0f) / tmp;
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
-        const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
-        const int col_smem = vals_smem ? col0 + tid : col_data;
-
-        const int idst = rowx*ncols_data + col_data;
-        const half2 result = vals[col_smem] * inv_sum;
-
-        if (need_check && col_data + 0 >= ncols_data) {
-            return;
-        }
-        dst[idst] = result.x;
-
-        if (need_check && col_data + WARP_SIZE >= ncols_data) {
-            return;
-        }
-
-        dst[idst + WARP_SIZE] = result.y;
-    }
-#else
-    (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-}
-
-template <bool vals_smem, int ncols_template, int block_size_template>
-static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
-    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
-
-    const int tid  = threadIdx.x;
-    const int rowx = blockIdx.x;
-    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
-
-    const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
-
-    const int warp_id = threadIdx.x / WARP_SIZE;
-    const int lane_id = threadIdx.x % WARP_SIZE;
-
     extern __shared__ float data_soft_max_f32[];
     float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
     // shared memory buffer to cache values between iterations:
@@ -6106,7 +6287,8 @@ static __global__ void soft_max_f32(const float * x, const float * y, float * ds
         const int ix = rowx*ncols + col;
         const int iy = rowy*ncols + col;
 
-        const float val = x[ix]*scale + (
+        const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
+
         vals[col] = val;
         max_val = max(max_val, val);
     }
@@ -6667,6 +6849,18 @@ static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k,
     dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
 }
 
+template<typename dst_t>
+static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
+}
+
+template<typename dst_t>
+static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
+}
+
 template <typename src_t, typename dst_t>
 static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
@@ -6706,6 +6900,10 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return dequantize_row_iq2_xs_cuda;
         case GGML_TYPE_IQ3_XXS:
             return dequantize_row_iq3_xxs_cuda;
+        case GGML_TYPE_IQ1_S:
+            return dequantize_row_iq1_s_cuda;
+        case GGML_TYPE_IQ4_NL:
+            return dequantize_row_iq4_nl_cuda;
         case GGML_TYPE_F32:
             return convert_unary_cuda<float>;
         default:
@@ -6741,6 +6939,10 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_iq2_xs_cuda;
         case GGML_TYPE_IQ3_XXS:
             return dequantize_row_iq3_xxs_cuda;
+        case GGML_TYPE_IQ1_S:
+            return dequantize_row_iq1_s_cuda;
+        case GGML_TYPE_IQ4_NL:
+            return dequantize_row_iq4_nl_cuda;
         case GGML_TYPE_F16:
             return convert_unary_cuda<half>;
         default:
@@ -6851,65 +7053,75 @@ static void mul_mat_vec_q_cuda(
     const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
 
     GGML_ASSERT(ncols_x % qk == 0);
-    GGML_ASSERT(ncols_y <=
+    GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
 
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
 
-
-
-        nwarps = g_device_caps[id].cc >= CC_RDNA2 ? MMVQ_NWARPS_AMD_RDNA2 : MMVQ_NWARPS_AMD_OLD;
-    } else {
-        nwarps = MMVQ_NWARPS_NVIDIA;
-    }
+    int64_t nwarps = 1;
+    int64_t rows_per_cuda_block = 1;
 
-
-
-
-    switch (nwarps) {
-        case 1: switch(ncols_y) {
+    if (g_device_caps[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
+        switch(ncols_y) {
             case 1:
-
-
+                nwarps = 4;
+                rows_per_cuda_block = 1;
                 break;
             case 2:
-                mul_mat_vec_q<1, 2, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
-                break;
             case 3:
-                mul_mat_vec_q<1, 3, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
-                break;
             case 4:
-
-
+                nwarps = 4;
+                rows_per_cuda_block = 2;
                 break;
-
-
-
-
-
-
-                mul_mat_vec_q<4, 1, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
-                break;
-            case 2:
-                mul_mat_vec_q<4, 2, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
-                break;
-            case 3:
-                mul_mat_vec_q<4, 3, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
-                break;
-            case 4:
-                mul_mat_vec_q<4, 4, qk, qi, block_q_t, vdr, vec_dot>
-                    <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
+            case 5:
+            case 6:
+            case 7:
+            case 8:
+                nwarps = 2;
+                rows_per_cuda_block = 2;
                 break;
             default:
                 GGML_ASSERT(false);
                 break;
-    }
+        }
+    }
+    const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
+    const dim3 block_nums(nblocks, 1, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
+    switch (ncols_y) {
+        case 1:
+            mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 2:
+            mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 3:
+            mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 4:
+            mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 5:
+            mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 6:
+            mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 7:
+            mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
+        case 8:
+            mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
+            break;
         default:
             GGML_ASSERT(false);
             break;
@@ -7568,89 +7780,53 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
     diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 }
 
-static void
-    int nth = WARP_SIZE;
-    while (nth < ncols_x/2 && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
-    const dim3 block_dims(nth,     1, 1);
-    const dim3 block_nums(nrows_x, 1, 1);
-    const size_t shmem = (GGML_PAD(ncols_x, 2*WARP_SIZE) + WARP_SIZE)*sizeof(half);
-    static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
-    if (shmem <= g_device_caps[g_main_device].smpb) {
-        switch (ncols_x) {
-            case 32:
-                soft_max_f16<true, 32, 32, true><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 64:
-                soft_max_f16<true, 64, 32, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            [... cases 128, 256, 512, 1024, 2048, 4096 and the default case follow the same soft_max_f16<...> pattern; elided here for brevity ...]
-        }
-    } else {
-        const size_t shmem_low = WARP_SIZE*sizeof(half);
-        soft_max_f16<false, 0, 0, true><<<block_nums, block_dims, shmem_low, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-    }
-}
-
-static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
+static void soft_max_f32_cuda(const float * x, const float * mask, const float * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
     int nth = WARP_SIZE;
     while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
     const dim3 block_dims(nth,     1, 1);
     const dim3 block_nums(nrows_x, 1, 1);
     const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
     static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
+
+    const uint32_t n_head_kv   = nrows_x/nrows_y;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
     if (shmem < g_device_caps[g_main_device].smpb) {
         switch (ncols_x) {
             case 32:
-                soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x,
+                soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
             case 64:
-                soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x,
+                soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
             case 128:
-                soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x,
+                soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
             case 256:
-                soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x,
+                soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
             case 512:
-                soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x,
+                soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
             case 1024:
-                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x,
+                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
             case 2048:
-                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x,
+                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
             case 4096:
-                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x,
+                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
             default:
-                soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x,
+                soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                 break;
         }
     } else {
         const size_t shmem_low = WARP_SIZE*sizeof(float);
-        soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x,
+        soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
     }
 }
 
@@ -7922,6 +8098,7 @@ GGML_CALL void ggml_init_cublas() {
     if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
         initialized = true;
         g_cublas_loaded = false;
+        fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
         return;
     }
 
@@ -8509,6 +8686,8 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ4_NL:
             return max_compute_capability >= CC_RDNA2 ? 128 : 64;
         default:
             GGML_ASSERT(false);
@@ -8532,6 +8711,8 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ4_NL:
            return max_compute_capability >= CC_VOLTA ? 128 : 64;
         case GGML_TYPE_Q6_K:
             return 64;
@@ -8629,6 +8810,14 @@ static void ggml_cuda_op_mul_mat_vec_q(
             mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
                 (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
+        case GGML_TYPE_IQ1_S:
+            mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_IQ4_NL:
+            mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
         default:
             GGML_ASSERT(false);
             break;
@@ -9068,30 +9257,36 @@ static void ggml_cuda_op_soft_max(
 
     GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
 
-    const int64_t ne00
+    const int64_t ne00    = src0->ne[0];
     const int64_t nrows_x = ggml_nrows(src0);
-    const int64_t nrows_y =
+    const int64_t nrows_y = src0->ne[1];
 
-    float scale
-
+    float scale    = 1.0f;
+    float max_bias = 0.0f;
 
-
-
-    const bool use_f16_soft_max = true;
-#else
-    const bool use_f16_soft_max = false;
-#endif // GGML_CUDA_F16
-#else
-    const bool use_f16_soft_max = false;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX
+    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
 
-
-
-
-
+    // positions tensor
+    float * src2_dd = nullptr;
+    cuda_pool_alloc<float> src2_f;
+
+    ggml_tensor * src2 = dst->src[2];
+    const bool use_src2 = src2 != nullptr;
+
+    if (use_src2) {
+        const bool src2_on_device = src2->backend == GGML_BACKEND_GPU;
+
+        if (src2_on_device) {
+            ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
+            src2_dd = (float *) src2_extra->data_device[g_main_device];
+        } else {
+            src2_dd = src2_f.alloc(ggml_nelements(src2));
+            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
+        }
     }
 
-    (
+    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream);
 }
 
 static void ggml_cuda_op_scale(
|
|
9226
9421
|
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
|
9227
9422
|
if (can_access_peer) {
|
9228
9423
|
if (enable_peer_access) {
|
9229
|
-
|
9424
|
+
cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
|
9425
|
+
if (err != cudaErrorPeerAccessAlreadyEnabled) {
|
9426
|
+
CUDA_CHECK(err);
|
9427
|
+
}
|
9230
9428
|
} else {
|
9231
|
-
|
9429
|
+
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
|
9430
|
+
if (err != cudaErrorPeerAccessNotEnabled) {
|
9431
|
+
CUDA_CHECK(err);
|
9432
|
+
}
|
9232
9433
|
}
|
9233
9434
|
}
|
9234
9435
|
}
|
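The change above makes peer-access toggling tolerant of the two benign CUDA error codes instead of aborting on them. The same pattern in isolation (a sketch with a hypothetical helper name, not code from the diff):

    static void set_peer_access_tolerant(int id_other, bool enable) {
        cudaError_t err = enable ? cudaDeviceEnablePeerAccess(id_other, 0)
                                 : cudaDeviceDisablePeerAccess(id_other);
        if (err != (enable ? cudaErrorPeerAccessAlreadyEnabled : cudaErrorPeerAccessNotEnabled)) {
            CUDA_CHECK(err); // any other error is still fatal
        }
    }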
@@ -9735,7 +9936,7 @@ static __global__ void k_compute_batched_ptrs(
         ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
 }
 
-static void
+static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
 
@@ -9893,39 +10094,69 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 
     int64_t min_compute_capability = INT_MAX;
 
+    bool any_pascal_with_slow_fp16 = false;
     if (split) {
         ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
         auto & tensor_split = buft_ctx->tensor_split;
         for (int id = 0; id < g_device_count; ++id) {
-
+            // skip devices that are not going to do any work:
+            if (tensor_split[id] >= (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
+                continue;
+            }
+
+            if (min_compute_capability > g_device_caps[id].cc) {
                 min_compute_capability = g_device_caps[id].cc;
             }
+            if (g_device_caps[id].cc == 610) {
+                any_pascal_with_slow_fp16 = true;
+            }
         }
     } else {
-        min_compute_capability
+        min_compute_capability = g_device_caps[g_main_device].cc;
+        any_pascal_with_slow_fp16 = g_device_caps[g_main_device].cc == 610;
     }
 
+    // check data types and tensor shapes for custom matrix multiplication kernels:
+    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
+
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+
+    bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 
     const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
-
+
 #ifdef CUDA_USE_TENSOR_CORES
     use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
 #endif // CUDA_USE_TENSOR_CORES
 
 #else
 
-
-    bool
+    // fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
+    const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
+
+    // mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
+    use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
+    use_mul_mat_q = use_mul_mat_q && min_compute_capability >= MIN_CC_DP4A;
+
 #ifdef CUDA_USE_TENSOR_CORES
     // when tensor cores are available, use them for large batch size
     // ref: https://github.com/ggerganov/llama.cpp/pull/3776
-    use_mul_mat_q
+    use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
 #endif // CUDA_USE_TENSOR_CORES
 
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 
-
+    // if mmvq is available it's a better choice than dmmv:
+#ifndef GGML_CUDA_FORCE_DMMV
+    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+#endif // GGML_CUDA_FORCE_DMMV
 
     // debug helpers
     //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
@@ -9943,33 +10174,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
     } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // KQ + KQV multi-batch
-
-    } else if (
-        ggml_cuda_op_mul_mat(src0, src1, dst,
-    } else if (
-
-
-
-#else
-        const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
-#endif // GGML_CUDA_FORCE_DMMV
-
-        if (use_mul_mat_vec_q) {
-            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
-        } else {
-            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
-        }
-    } else {
-        if (src1->ne[1] <= 4 && min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32) {
-            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
-        } else if (use_mul_mat_q) {
-            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
-        } else {
-            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
-        }
-    }
+        ggml_cuda_mul_mat_batched_cublas(src0, src1, dst);
+    } else if (use_dequantize_mul_mat_vec) {
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+    } else if (use_mul_mat_vec_q) {
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+    } else if (use_mul_mat_q) {
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
     } else {
-
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     }
 }
 
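Taken together, the two ggml_cuda_mul_mat hunks above replace ad-hoc branching with named predicates (use_dequantize_mul_mat_vec, use_mul_mat_vec_q, use_mul_mat_q) that are then dispatched in order. Below is a condensed host-side restatement of that selection logic, assuming the default GGML_CUDA_DMMV_X of 32 and treating the build-time switches (CUDA_USE_TENSOR_CORES, GGML_CUDA_FORCE_DMMV) and the ggml_cuda_supports_mmq check as boolean inputs; the enum and function names are illustrative only.

#include <cstdint>

// Kernel families referenced by the dispatch chain above (names illustrative).
enum class mul_mat_path { dequantize_mul_mat_vec, mul_mat_vec_q, mul_mat_q, cublas };

mul_mat_path choose_path(bool src0_is_quantized, bool src0_is_f16, bool src0_mmq_supported,
                         bool src1_dst_are_f32, int64_t ne00, int64_t src1_ncols,
                         int min_compute_capability, bool fp16_performance_good,
                         bool tensor_cores, bool force_dmmv) {
    constexpr int     min_cc_dp4a    = 610; // __dp4a is only available for CC >= 6.1 on NVIDIA
    constexpr int64_t mmvq_max_batch = 8;   // MMVQ_MAX_BATCH_SIZE
    constexpr int64_t mmq_max_batch  = 32;  // MMQ_MAX_BATCH_SIZE
    constexpr int64_t dmmv_x         = 32;  // assumed default GGML_CUDA_DMMV_X

    bool use_dmmv = (src0_is_quantized || src0_is_f16) && src1_dst_are_f32
                 && ne00 % dmmv_x == 0 && src1_ncols == 1;
    bool use_mmvq = src0_is_quantized && src1_dst_are_f32
                 && src1_ncols <= mmvq_max_batch && min_compute_capability >= min_cc_dp4a;
    bool use_mmq  = src0_mmq_supported && src1_dst_are_f32
                 && min_compute_capability >= min_cc_dp4a;

    // when tensor cores are available and fp16 math is fast, cuBLAS wins for large batches
    if (tensor_cores) {
        use_mmq = use_mmq && (!fp16_performance_good || src1_ncols <= mmq_max_batch);
    }

    // mmvq beats dmmv whenever both are applicable, unless dmmv is forced at build time
    if (!force_dmmv) {
        use_dmmv = use_dmmv && !use_mmvq;
    }

    if (use_dmmv) { return mul_mat_path::dequantize_mul_mat_vec; }
    if (use_mmvq) { return mul_mat_path::mul_mat_vec_q; }
    if (use_mmq)  { return mul_mat_path::mul_mat_q; }
    return mul_mat_path::cublas;
}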
@@ -10888,10 +11101,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backe
     UNUSED(buffer);
 }
 
-
-
-
-
+static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
+    UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
+}
 
 GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
@@ -11279,7 +11492,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
                 assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
-                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
                 assert(node->src[j]->extra != nullptr);
             }
         }
@@ -11327,7 +11540,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                     return false;
                 }
                 ggml_type a_type = a->type;
-                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS
+                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
+                    a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL) {
                 if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
                     return false;
                 }