llama_cpp 0.12.3 → 0.12.5
This diff covers publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +22 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -2
- data/vendor/tmp/llama.cpp/Makefile +160 -56
- data/vendor/tmp/llama.cpp/ggml-alloc.c +85 -25
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +115 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +688 -270
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1990 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.h +46 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +121 -86
- data/vendor/tmp/llama.cpp/ggml-metal.metal +303 -4
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +95 -3
- data/vendor/tmp/llama.cpp/ggml-opencl.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +745 -109
- data/vendor/tmp/llama.cpp/ggml-quants.h +81 -56
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +15296 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.h +29 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +51714 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5726 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +39 -0
- data/vendor/tmp/llama.cpp/ggml.c +356 -60
- data/vendor/tmp/llama.cpp/ggml.h +7 -1
- data/vendor/tmp/llama.cpp/llama.cpp +876 -118
- data/vendor/tmp/llama.cpp/llama.h +12 -16
- metadata +9 -2
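The largest single change in this release is the vendored ggml-cuda.cu update shown below, which adds IQ3_XXS dequantization and dot-product kernels, hardsigmoid/hardswish and pool2d kernels, 4-dimensional tensor copies, and a batched mul_mat_vec_q path. As a minimal, self-contained sketch of the element-wise launch pattern the new unary kernels follow (the kernel body mirrors the hardsigmoid_f32 kernel in the diff; BLOCK_SIZE and the host-side test harness are assumptions added for illustration, not code from the gem):

// Sketch only: standalone CUDA program reproducing the element-wise launch
// pattern of the new hardsigmoid_f32 kernel. BLOCK_SIZE and the test values
// are assumptions for this example.
#include <cstdio>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256 // mirrors CUDA_HARDSIGMOID_BLOCK_SIZE in the diff

__global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    // hardsigmoid(x) = clamp((x + 3) / 6, 0, 1)
    dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
}

int main() {
    const int k = 1024;
    float *x = nullptr, *dst = nullptr;
    cudaMallocManaged(&x, k*sizeof(float));
    cudaMallocManaged(&dst, k*sizeof(float));
    for (int i = 0; i < k; ++i) {
        x[i] = 0.01f*i - 5.0f;
    }

    // same grid-size rounding as the *_f32_cuda launch helpers in the diff
    const int num_blocks = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    hardsigmoid_f32<<<num_blocks, BLOCK_SIZE>>>(x, dst, k);
    cudaDeviceSynchronize();

    printf("hardsigmoid(%.2f) = %.4f\n", x[0], dst[0]);
    cudaFree(x);
    cudaFree(dst);
    return 0;
}

The (k + BLOCK_SIZE - 1) / BLOCK_SIZE rounding is the same launch convention used by the hardsigmoid_f32_cuda and hardswish_f32_cuda helpers added in the diff below.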
@@ -12,9 +12,10 @@
|
|
12
12
|
#include <vector>
|
13
13
|
#include <map>
|
14
14
|
#include <array>
|
15
|
-
|
16
|
-
|
17
|
-
#
|
15
|
+
|
16
|
+
// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
|
17
|
+
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
18
|
+
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
|
18
19
|
|
19
20
|
#if defined(GGML_USE_HIPBLAS)
|
20
21
|
#include <hip/hip_runtime.h>
|
@@ -118,6 +119,11 @@
|
|
118
119
|
|
119
120
|
#endif // defined(GGML_USE_HIPBLAS)
|
120
121
|
|
122
|
+
// ggml-cuda need half type so keep ggml headers include at last
|
123
|
+
#include "ggml-cuda.h"
|
124
|
+
#include "ggml.h"
|
125
|
+
#include "ggml-backend-impl.h"
|
126
|
+
|
121
127
|
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
|
122
128
|
|
123
129
|
#define CC_PASCAL 600
|
@@ -185,6 +191,10 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
|
185
191
|
#endif // __has_builtin(__builtin_elementwise_sub_sat)
|
186
192
|
}
|
187
193
|
|
194
|
+
static __device__ __forceinline__ int __vsub4(const int a, const int b) {
|
195
|
+
return __vsubss4(a, b);
|
196
|
+
}
|
197
|
+
|
188
198
|
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
189
199
|
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
|
190
200
|
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
@@ -499,6 +509,14 @@ typedef struct {
|
|
499
509
|
} block_iq2_xs;
|
500
510
|
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
501
511
|
|
512
|
+
#define QR3_XXS 8
|
513
|
+
#define QI3_XXS (QK_K / (4*QR3_XXS))
|
514
|
+
typedef struct {
|
515
|
+
half d;
|
516
|
+
uint8_t qs[3*(QK_K/8)];
|
517
|
+
} block_iq3_xxs;
|
518
|
+
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
519
|
+
|
502
520
|
#define WARP_SIZE 32
|
503
521
|
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
504
522
|
|
@@ -506,6 +524,8 @@ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16
|
|
506
524
|
#define CUDA_SILU_BLOCK_SIZE 256
|
507
525
|
#define CUDA_TANH_BLOCK_SIZE 256
|
508
526
|
#define CUDA_RELU_BLOCK_SIZE 256
|
527
|
+
#define CUDA_HARDSIGMOID_BLOCK_SIZE 256
|
528
|
+
#define CUDA_HARDSWISH_BLOCK_SIZE 256
|
509
529
|
#define CUDA_SQR_BLOCK_SIZE 256
|
510
530
|
#define CUDA_CPY_BLOCK_SIZE 32
|
511
531
|
#define CUDA_SCALE_BLOCK_SIZE 256
|
@@ -522,6 +542,7 @@ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16
|
|
522
542
|
#define CUDA_PAD_BLOCK_SIZE 256
|
523
543
|
#define CUDA_ACC_BLOCK_SIZE 256
|
524
544
|
#define CUDA_IM2COL_BLOCK_SIZE 256
|
545
|
+
#define CUDA_POOL2D_BLOCK_SIZE 256
|
525
546
|
|
526
547
|
#define CUDA_Q8_0_NE_ALIGN 2048
|
527
548
|
|
@@ -582,13 +603,28 @@ static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0,
|
|
582
603
|
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
583
604
|
|
584
605
|
[[noreturn]]
|
585
|
-
static __device__ void
|
586
|
-
|
606
|
+
static __device__ void no_device_code(
|
607
|
+
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
|
608
|
+
|
609
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
610
|
+
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
|
611
|
+
file_name, line, function_name, arch);
|
612
|
+
(void) arch_list;
|
613
|
+
#else
|
614
|
+
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
|
615
|
+
file_name, line, function_name, arch, arch_list);
|
616
|
+
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
587
617
|
__trap();
|
588
618
|
|
589
|
-
(void)
|
619
|
+
(void) no_device_code; // suppress unused function warning
|
590
620
|
}
|
591
621
|
|
622
|
+
#ifdef __CUDA_ARCH__
|
623
|
+
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
|
624
|
+
#else
|
625
|
+
#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
|
626
|
+
#endif // __CUDA_ARCH__
|
627
|
+
|
592
628
|
static __device__ __forceinline__ float warp_reduce_sum(float x) {
|
593
629
|
#pragma unroll
|
594
630
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
@@ -615,7 +651,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
|
615
651
|
return a;
|
616
652
|
#else
|
617
653
|
(void) a;
|
618
|
-
|
654
|
+
NO_DEVICE_CODE;
|
619
655
|
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
620
656
|
}
|
621
657
|
|
@@ -636,7 +672,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
|
636
672
|
return x;
|
637
673
|
#else
|
638
674
|
(void) x;
|
639
|
-
|
675
|
+
NO_DEVICE_CODE;
|
640
676
|
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
641
677
|
}
|
642
678
|
|
@@ -790,6 +826,24 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
|
|
790
826
|
dst[i] = fmaxf(x[i], 0);
|
791
827
|
}
|
792
828
|
|
829
|
+
static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
|
830
|
+
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
831
|
+
|
832
|
+
if (i >= k) {
|
833
|
+
return;
|
834
|
+
}
|
835
|
+
dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
|
836
|
+
}
|
837
|
+
|
838
|
+
static __global__ void hardswish_f32(const float * x, float * dst, const int k) {
|
839
|
+
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
840
|
+
|
841
|
+
if (i >= k) {
|
842
|
+
return;
|
843
|
+
}
|
844
|
+
dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
|
845
|
+
}
|
846
|
+
|
793
847
|
static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
|
794
848
|
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
795
849
|
if (i >= k) {
|
@@ -1592,6 +1646,41 @@ static const __device__ uint64_t iq2xs_grid[512] = {
|
|
1592
1646
|
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
1593
1647
|
};
|
1594
1648
|
|
1649
|
+
static const __device__ uint32_t iq3xxs_grid[256] = {
|
1650
|
+
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
1651
|
+
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
1652
|
+
0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
|
1653
|
+
0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
|
1654
|
+
0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
|
1655
|
+
0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
|
1656
|
+
0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
|
1657
|
+
0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
|
1658
|
+
0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
|
1659
|
+
0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
|
1660
|
+
0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
|
1661
|
+
0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
|
1662
|
+
0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
|
1663
|
+
0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
|
1664
|
+
0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
|
1665
|
+
0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
|
1666
|
+
0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
|
1667
|
+
0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
|
1668
|
+
0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
|
1669
|
+
0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
|
1670
|
+
0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
|
1671
|
+
0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
|
1672
|
+
0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
|
1673
|
+
0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
|
1674
|
+
0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
|
1675
|
+
0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
|
1676
|
+
0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
|
1677
|
+
0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
|
1678
|
+
0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
|
1679
|
+
0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
|
1680
|
+
0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
|
1681
|
+
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
1682
|
+
};
|
1683
|
+
|
1595
1684
|
static const __device__ uint8_t ksigns_iq2xs[128] = {
|
1596
1685
|
0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
|
1597
1686
|
144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
|
@@ -1603,6 +1692,43 @@ static const __device__ uint8_t ksigns_iq2xs[128] = {
|
|
1603
1692
|
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
|
1604
1693
|
};
|
1605
1694
|
|
1695
|
+
//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1696
|
+
static const __device__ uint64_t ksigns64[128] = {
|
1697
|
+
0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
|
1698
|
+
0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
|
1699
|
+
0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
|
1700
|
+
0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
|
1701
|
+
0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
|
1702
|
+
0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
|
1703
|
+
0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
|
1704
|
+
0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
|
1705
|
+
0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
|
1706
|
+
0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
|
1707
|
+
0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
|
1708
|
+
0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
|
1709
|
+
0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
|
1710
|
+
0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
|
1711
|
+
0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
|
1712
|
+
0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
|
1713
|
+
0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
|
1714
|
+
0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
|
1715
|
+
0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
|
1716
|
+
0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
|
1717
|
+
0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
|
1718
|
+
0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
|
1719
|
+
0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
|
1720
|
+
0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
|
1721
|
+
0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
|
1722
|
+
0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
|
1723
|
+
0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
|
1724
|
+
0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
|
1725
|
+
0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
|
1726
|
+
0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
|
1727
|
+
0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
|
1728
|
+
0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
|
1729
|
+
};
|
1730
|
+
//#endif
|
1731
|
+
|
1606
1732
|
static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
|
1607
1733
|
|
1608
1734
|
inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
|
@@ -1669,6 +1795,34 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
|
|
1669
1795
|
|
1670
1796
|
}
|
1671
1797
|
|
1798
|
+
template<typename dst_t>
|
1799
|
+
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
1800
|
+
|
1801
|
+
const int i = blockIdx.x;
|
1802
|
+
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
1803
|
+
|
1804
|
+
const int tid = threadIdx.x;
|
1805
|
+
#if QK_K == 256
|
1806
|
+
const int il = tid/8; // 0...3
|
1807
|
+
const int ib = tid%8; // 0...7
|
1808
|
+
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
1809
|
+
const uint8_t * q3 = x[i].qs + 8*ib;
|
1810
|
+
const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
|
1811
|
+
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
|
1812
|
+
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
|
1813
|
+
const uint32_t aux32 = gas[0] | (gas[1] << 16);
|
1814
|
+
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
|
1815
|
+
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
|
1816
|
+
for (int j = 0; j < 4; ++j) {
|
1817
|
+
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
1818
|
+
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
1819
|
+
}
|
1820
|
+
#else
|
1821
|
+
assert(false);
|
1822
|
+
#endif
|
1823
|
+
|
1824
|
+
}
|
1825
|
+
|
1672
1826
|
static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
1673
1827
|
|
1674
1828
|
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
@@ -2419,7 +2573,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
|
|
2419
2573
|
}
|
2420
2574
|
#else
|
2421
2575
|
(void) vx; (void) y; (void) k;
|
2422
|
-
|
2576
|
+
NO_DEVICE_CODE;
|
2423
2577
|
#endif // __CUDA_ARCH__ >= CC_PASCAL
|
2424
2578
|
}
|
2425
2579
|
|
@@ -2450,7 +2604,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
|
|
2450
2604
|
// second part effectively subtracts 8 from each quant value
|
2451
2605
|
return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
|
2452
2606
|
#else
|
2453
|
-
|
2607
|
+
NO_DEVICE_CODE;
|
2454
2608
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2455
2609
|
}
|
2456
2610
|
|
@@ -2487,7 +2641,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
|
|
2487
2641
|
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
|
2488
2642
|
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
|
2489
2643
|
#else
|
2490
|
-
|
2644
|
+
NO_DEVICE_CODE;
|
2491
2645
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2492
2646
|
}
|
2493
2647
|
|
@@ -2522,7 +2676,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
|
|
2522
2676
|
// second part effectively subtracts 16 from each quant value
|
2523
2677
|
return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
|
2524
2678
|
#else
|
2525
|
-
|
2679
|
+
NO_DEVICE_CODE;
|
2526
2680
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2527
2681
|
}
|
2528
2682
|
|
@@ -2567,7 +2721,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
|
|
2567
2721
|
return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
|
2568
2722
|
|
2569
2723
|
#else
|
2570
|
-
|
2724
|
+
NO_DEVICE_CODE;
|
2571
2725
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2572
2726
|
}
|
2573
2727
|
|
@@ -2588,7 +2742,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
|
|
2588
2742
|
|
2589
2743
|
return d8_0*d8_1 * sumi;
|
2590
2744
|
#else
|
2591
|
-
|
2745
|
+
NO_DEVICE_CODE;
|
2592
2746
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2593
2747
|
}
|
2594
2748
|
|
@@ -2618,7 +2772,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
|
|
2618
2772
|
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
|
2619
2773
|
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
|
2620
2774
|
#else
|
2621
|
-
|
2775
|
+
NO_DEVICE_CODE;
|
2622
2776
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2623
2777
|
}
|
2624
2778
|
|
@@ -2653,7 +2807,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
|
|
2653
2807
|
|
2654
2808
|
return dm2f.x*sumf_d - dm2f.y*sumf_m;
|
2655
2809
|
#else
|
2656
|
-
|
2810
|
+
NO_DEVICE_CODE;
|
2657
2811
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2658
2812
|
}
|
2659
2813
|
|
@@ -2690,7 +2844,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
|
|
2690
2844
|
|
2691
2845
|
return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
|
2692
2846
|
#else
|
2693
|
-
|
2847
|
+
NO_DEVICE_CODE;
|
2694
2848
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2695
2849
|
}
|
2696
2850
|
|
@@ -2730,7 +2884,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
|
|
2730
2884
|
|
2731
2885
|
return d3 * sumf;
|
2732
2886
|
#else
|
2733
|
-
|
2887
|
+
NO_DEVICE_CODE;
|
2734
2888
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2735
2889
|
}
|
2736
2890
|
|
@@ -2755,7 +2909,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
|
|
2755
2909
|
|
2756
2910
|
return d3*d8 * sumi;
|
2757
2911
|
#else
|
2758
|
-
|
2912
|
+
NO_DEVICE_CODE;
|
2759
2913
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2760
2914
|
}
|
2761
2915
|
|
@@ -2788,7 +2942,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
|
|
2788
2942
|
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
2789
2943
|
|
2790
2944
|
#else
|
2791
|
-
|
2945
|
+
NO_DEVICE_CODE;
|
2792
2946
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2793
2947
|
}
|
2794
2948
|
|
@@ -2821,7 +2975,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
|
|
2821
2975
|
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
2822
2976
|
|
2823
2977
|
#else
|
2824
|
-
|
2978
|
+
NO_DEVICE_CODE;
|
2825
2979
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2826
2980
|
}
|
2827
2981
|
|
@@ -2861,7 +3015,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
|
|
2861
3015
|
return dm5f.x*sumf_d - dm5f.y*sumf_m;
|
2862
3016
|
|
2863
3017
|
#else
|
2864
|
-
|
3018
|
+
NO_DEVICE_CODE;
|
2865
3019
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2866
3020
|
}
|
2867
3021
|
|
@@ -2894,7 +3048,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
|
|
2894
3048
|
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
2895
3049
|
|
2896
3050
|
#else
|
2897
|
-
|
3051
|
+
NO_DEVICE_CODE;
|
2898
3052
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2899
3053
|
}
|
2900
3054
|
|
@@ -2924,7 +3078,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
|
|
2924
3078
|
|
2925
3079
|
return d*sumf;
|
2926
3080
|
#else
|
2927
|
-
|
3081
|
+
NO_DEVICE_CODE;
|
2928
3082
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2929
3083
|
}
|
2930
3084
|
|
@@ -2955,7 +3109,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
|
|
2955
3109
|
return d6 * sumf_d;
|
2956
3110
|
|
2957
3111
|
#else
|
2958
|
-
|
3112
|
+
NO_DEVICE_CODE;
|
2959
3113
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2960
3114
|
}
|
2961
3115
|
|
@@ -3821,7 +3975,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|
3821
3975
|
return dall * sumf_d - dmin * sumf_m;
|
3822
3976
|
|
3823
3977
|
#else
|
3824
|
-
|
3978
|
+
NO_DEVICE_CODE;
|
3825
3979
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
3826
3980
|
|
3827
3981
|
#endif
|
@@ -4004,7 +4158,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|
4004
4158
|
return d * sumf_d;
|
4005
4159
|
|
4006
4160
|
#else
|
4007
|
-
|
4161
|
+
NO_DEVICE_CODE;
|
4008
4162
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
4009
4163
|
|
4010
4164
|
#endif
|
@@ -4262,7 +4416,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
|
4262
4416
|
q8 += 8;
|
4263
4417
|
aux32 >>= 7;
|
4264
4418
|
}
|
4265
|
-
const float d = (float)bq2->d * (0.5f + aux32) * (
|
4419
|
+
const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
|
4266
4420
|
return d * sumi;
|
4267
4421
|
#else
|
4268
4422
|
// iqs is 0...15
|
@@ -4273,7 +4427,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
|
4273
4427
|
const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
|
4274
4428
|
const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
|
4275
4429
|
const uint32_t aux32 = q2[2] | (q2[3] << 16);
|
4276
|
-
const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * (
|
4430
|
+
const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
|
4277
4431
|
const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
|
4278
4432
|
const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
|
4279
4433
|
const int8_t * q8 = bq8_1[ib32].qs + 16*il;
|
@@ -4292,6 +4446,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
|
4292
4446
|
|
4293
4447
|
static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
4294
4448
|
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
4449
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
4295
4450
|
#if QK_K == 256
|
4296
4451
|
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
|
4297
4452
|
|
@@ -4302,28 +4457,69 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
|
4302
4457
|
const uint8_t ls2 = bq2->scales[ib32] >> 4;
|
4303
4458
|
int sumi1 = 0;
|
4304
4459
|
for (int l = 0; l < 2; ++l) {
|
4305
|
-
const
|
4306
|
-
const
|
4307
|
-
|
4308
|
-
|
4309
|
-
|
4460
|
+
const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
|
4461
|
+
const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
|
4462
|
+
const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
|
4463
|
+
const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
|
4464
|
+
sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
|
4465
|
+
sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
|
4310
4466
|
q8 += 8;
|
4311
4467
|
}
|
4312
4468
|
int sumi2 = 0;
|
4313
4469
|
for (int l = 2; l < 4; ++l) {
|
4314
|
-
const
|
4315
|
-
const
|
4316
|
-
|
4317
|
-
|
4318
|
-
|
4470
|
+
const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
|
4471
|
+
const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
|
4472
|
+
const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
|
4473
|
+
const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
|
4474
|
+
sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
|
4475
|
+
sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
|
4319
4476
|
q8 += 8;
|
4320
4477
|
}
|
4321
|
-
const float d = (float)bq2->d * (
|
4478
|
+
const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
|
4322
4479
|
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
4323
4480
|
#else
|
4324
4481
|
assert(false);
|
4325
4482
|
return 0.f;
|
4326
4483
|
#endif
|
4484
|
+
#else
|
4485
|
+
assert(false);
|
4486
|
+
return 0.f;
|
4487
|
+
#endif
|
4488
|
+
}
|
4489
|
+
|
4490
|
+
static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
|
4491
|
+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
4492
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
4493
|
+
#if QK_K == 256
|
4494
|
+
const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
|
4495
|
+
|
4496
|
+
const int ib32 = iqs;
|
4497
|
+
const uint8_t * q3 = bq2->qs + 8*ib32;
|
4498
|
+
const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
|
4499
|
+
const int8_t * q8 = bq8_1[ib32].qs;
|
4500
|
+
uint32_t aux32 = gas[0] | (gas[1] << 16);
|
4501
|
+
int sumi = 0;
|
4502
|
+
for (int l = 0; l < 4; ++l) {
|
4503
|
+
const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
|
4504
|
+
const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
|
4505
|
+
const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
|
4506
|
+
const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
|
4507
|
+
const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
|
4508
|
+
sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
|
4509
|
+
sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
|
4510
|
+
q8 += 8;
|
4511
|
+
aux32 >>= 7;
|
4512
|
+
}
|
4513
|
+
const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
|
4514
|
+
return d * sumi;
|
4515
|
+
#else
|
4516
|
+
assert(false);
|
4517
|
+
return 0.f;
|
4518
|
+
#endif
|
4519
|
+
#else
|
4520
|
+
assert(false);
|
4521
|
+
return 0.f;
|
4522
|
+
#endif
|
4327
4523
|
}
|
4328
4524
|
|
4329
4525
|
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
|
@@ -4499,7 +4695,7 @@ template <bool need_check> static __global__ void
|
|
4499
4695
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4500
4696
|
#else
|
4501
4697
|
(void) vec_dot_q4_0_q8_1_mul_mat;
|
4502
|
-
|
4698
|
+
NO_DEVICE_CODE;
|
4503
4699
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
4504
4700
|
}
|
4505
4701
|
|
@@ -4568,7 +4764,7 @@ template <bool need_check> static __global__ void
|
|
4568
4764
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4569
4765
|
#else
|
4570
4766
|
(void) vec_dot_q4_1_q8_1_mul_mat;
|
4571
|
-
|
4767
|
+
NO_DEVICE_CODE;
|
4572
4768
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
4573
4769
|
}
|
4574
4770
|
|
@@ -4635,7 +4831,7 @@ template <bool need_check> static __global__ void
|
|
4635
4831
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4636
4832
|
#else
|
4637
4833
|
(void) vec_dot_q5_0_q8_1_mul_mat;
|
4638
|
-
|
4834
|
+
NO_DEVICE_CODE;
|
4639
4835
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
4640
4836
|
}
|
4641
4837
|
|
@@ -4702,7 +4898,7 @@ mul_mat_q5_1(
|
|
4702
4898
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4703
4899
|
#else
|
4704
4900
|
(void) vec_dot_q5_1_q8_1_mul_mat;
|
4705
|
-
|
4901
|
+
NO_DEVICE_CODE;
|
4706
4902
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
4707
4903
|
}
|
4708
4904
|
|
@@ -4769,7 +4965,7 @@ template <bool need_check> static __global__ void
|
|
4769
4965
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4770
4966
|
#else
|
4771
4967
|
(void) vec_dot_q8_0_q8_1_mul_mat;
|
4772
|
-
|
4968
|
+
NO_DEVICE_CODE;
|
4773
4969
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
4774
4970
|
}
|
4775
4971
|
|
@@ -4836,7 +5032,7 @@ mul_mat_q2_K(
|
|
4836
5032
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4837
5033
|
#else
|
4838
5034
|
(void) vec_dot_q2_K_q8_1_mul_mat;
|
4839
|
-
|
5035
|
+
NO_DEVICE_CODE;
|
4840
5036
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
4841
5037
|
}
|
4842
5038
|
|
@@ -4905,7 +5101,7 @@ template <bool need_check> static __global__ void
|
|
4905
5101
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4906
5102
|
#else
|
4907
5103
|
(void) vec_dot_q3_K_q8_1_mul_mat;
|
4908
|
-
|
5104
|
+
NO_DEVICE_CODE;
|
4909
5105
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
4910
5106
|
}
|
4911
5107
|
|
@@ -4974,7 +5170,7 @@ template <bool need_check> static __global__ void
|
|
4974
5170
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4975
5171
|
#else
|
4976
5172
|
(void) vec_dot_q4_K_q8_1_mul_mat;
|
4977
|
-
|
5173
|
+
NO_DEVICE_CODE;
|
4978
5174
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
4979
5175
|
}
|
4980
5176
|
|
@@ -5041,7 +5237,7 @@ mul_mat_q5_K(
|
|
5041
5237
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
5042
5238
|
#else
|
5043
5239
|
(void) vec_dot_q5_K_q8_1_mul_mat;
|
5044
|
-
|
5240
|
+
NO_DEVICE_CODE;
|
5045
5241
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
5046
5242
|
}
|
5047
5243
|
|
@@ -5110,45 +5306,74 @@ template <bool need_check> static __global__ void
|
|
5110
5306
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
5111
5307
|
#else
|
5112
5308
|
(void) vec_dot_q6_K_q8_1_mul_mat;
|
5113
|
-
|
5309
|
+
NO_DEVICE_CODE;
|
5114
5310
|
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
5115
5311
|
}
|
5116
5312
|
|
5117
|
-
|
5118
|
-
|
5119
|
-
|
5313
|
+
#define MMVQ_NWARPS_NVIDIA 4
|
5314
|
+
#define MMVQ_NWARPS_AMD_RDNA2 1
|
5315
|
+
#define MMVQ_NWARPS_AMD_OLD 4
|
5120
5316
|
|
5121
|
-
|
5122
|
-
|
5123
|
-
|
5317
|
+
template <int nwarps, int ncols_y_template, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
5318
|
+
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
5319
|
+
__launch_bounds__(nwarps*WARP_SIZE, 1) // tells the compiler to use as many registers as it wants
|
5320
|
+
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
5321
|
+
static __global__ void mul_mat_vec_q(
|
5322
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
5323
|
+
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y_par, const int nrows_dst) {
|
5324
|
+
|
5325
|
+
const int ncols_y = ncols_y_template != 0 ? ncols_y_template : ncols_y_par;
|
5326
|
+
|
5327
|
+
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
5328
|
+
const int row = blockIdx.x;
|
5124
5329
|
|
5125
|
-
const int
|
5126
|
-
const int
|
5330
|
+
const int blocks_per_row_x = ncols_x / qk;
|
5331
|
+
const int blocks_per_col_y = nrows_y / QK8_1;
|
5332
|
+
const int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
|
5127
5333
|
|
5128
5334
|
// partial sum for each thread
|
5129
|
-
float tmp = 0.0f;
|
5335
|
+
float tmp[ncols_y_template != 0 ? ncols_y_template : 8] = {0.0f};
|
5130
5336
|
|
5131
5337
|
const block_q_t * x = (const block_q_t *) vx;
|
5132
5338
|
const block_q8_1 * y = (const block_q8_1 *) vy;
|
5133
5339
|
|
5134
|
-
for (int i =
|
5135
|
-
const int ibx = row*
|
5340
|
+
for (int i = tid / (qi/vdr); i < blocks_per_row_x; i += blocks_per_iter) {
|
5341
|
+
const int ibx = row*blocks_per_row_x + i; // x block index
|
5136
5342
|
|
5137
|
-
const int iby =
|
5343
|
+
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
5138
5344
|
|
5139
|
-
const int iqs = vdr * (
|
5345
|
+
const int iqs = vdr * (tid % (qi/vdr)); // x block quant index when casting the quants to int
|
5140
5346
|
|
5141
|
-
|
5347
|
+
#pragma unroll
|
5348
|
+
for (int j = 0; j < ncols_y; ++j) {
|
5349
|
+
tmp[j] += vec_dot_q_cuda(&x[ibx], &y[j*blocks_per_col_y + iby], iqs);
|
5350
|
+
}
|
5142
5351
|
}
|
5143
5352
|
|
5144
|
-
|
5353
|
+
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y_template != 0 ? ncols_y_template : 8][WARP_SIZE];
|
5354
|
+
if (threadIdx.y > 0) {
|
5145
5355
|
#pragma unroll
|
5146
|
-
|
5147
|
-
|
5356
|
+
for (int j = 0; j < ncols_y; ++j) {
|
5357
|
+
tmp_shared[threadIdx.y-1][j][threadIdx.x] = tmp[j];
|
5358
|
+
}
|
5359
|
+
}
|
5360
|
+
__syncthreads();
|
5361
|
+
if (threadIdx.y > 0) {
|
5362
|
+
return;
|
5148
5363
|
}
|
5149
5364
|
|
5150
|
-
|
5151
|
-
|
5365
|
+
// sum up partial sums and write back result
|
5366
|
+
#pragma unroll
|
5367
|
+
for (int j = 0; j < ncols_y; ++j) {
|
5368
|
+
#pragma unroll
|
5369
|
+
for (int i = 0; i < nwarps-1; ++i) {
|
5370
|
+
tmp[j] += tmp_shared[i][j][threadIdx.x];
|
5371
|
+
}
|
5372
|
+
tmp[j] = warp_reduce_sum(tmp[j]);
|
5373
|
+
|
5374
|
+
if (threadIdx.x == 0) {
|
5375
|
+
dst[j*nrows_dst + row] = tmp[j];
|
5376
|
+
}
|
5152
5377
|
}
|
5153
5378
|
}
|
5154
5379
|
|
@@ -5336,27 +5561,37 @@ static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
|
|
5336
5561
|
*dsti = *xi;
|
5337
5562
|
}
|
5338
5563
|
|
5564
|
+
static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
|
5565
|
+
const half * xi = (const half *) cxi;
|
5566
|
+
float * dsti = (float *) cdsti;
|
5567
|
+
|
5568
|
+
*dsti = *xi;
|
5569
|
+
}
|
5570
|
+
|
5339
5571
|
template <cpy_kernel_t cpy_1>
|
5340
5572
|
static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
5341
|
-
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
5342
|
-
const int ne10, const int ne11, const int
|
5573
|
+
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
5574
|
+
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
5575
|
+
const int nb12, const int nb13) {
|
5343
5576
|
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
5344
5577
|
|
5345
5578
|
if (i >= ne) {
|
5346
5579
|
return;
|
5347
5580
|
}
|
5348
5581
|
|
5349
|
-
// determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
|
5582
|
+
// determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
|
5350
5583
|
// then combine those indices with the corresponding byte offsets to get the total offsets
|
5351
|
-
const int
|
5352
|
-
const int
|
5353
|
-
const int
|
5354
|
-
const int
|
5355
|
-
|
5356
|
-
|
5357
|
-
const int
|
5358
|
-
const int
|
5359
|
-
const int
|
5584
|
+
const int i03 = i/(ne00 * ne01 * ne02);
|
5585
|
+
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
5586
|
+
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
5587
|
+
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
5588
|
+
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
5589
|
+
|
5590
|
+
const int i13 = i/(ne10 * ne11 * ne12);
|
5591
|
+
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
5592
|
+
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
5593
|
+
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
5594
|
+
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
|
5360
5595
|
|
5361
5596
|
cpy_1(cx + x_offset, cdst + dst_offset);
|
5362
5597
|
}
|
@@ -5450,23 +5685,26 @@ static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
|
|
5450
5685
|
|
5451
5686
|
template <cpy_kernel_t cpy_blck, int qk>
|
5452
5687
|
static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
|
5453
|
-
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
5454
|
-
const int ne10, const int ne11, const int
|
5688
|
+
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
5689
|
+
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
5690
|
+
const int nb12, const int nb13) {
|
5455
5691
|
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
|
5456
5692
|
|
5457
5693
|
if (i >= ne) {
|
5458
5694
|
return;
|
5459
5695
|
}
|
5460
5696
|
|
5461
|
-
const int
|
5462
|
-
const int
|
5463
|
-
const int
|
5464
|
-
const int
|
5697
|
+
const int i03 = i/(ne00 * ne01 * ne02);
|
5698
|
+
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
5699
|
+
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
5700
|
+
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
5701
|
+
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
5465
5702
|
|
5466
|
-
const int
|
5467
|
-
const int
|
5468
|
-
const int
|
5469
|
-
const int
|
5703
|
+
const int i13 = i/(ne10 * ne11 * ne12);
|
5704
|
+
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
5705
|
+
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
5706
|
+
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
5707
|
+
const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
|
5470
5708
|
|
5471
5709
|
cpy_blck(cx + x_offset, cdst + dst_offset);
|
5472
5710
|
}
|
@@ -5635,7 +5873,7 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
|
|
5635
5873
|
}
|
5636
5874
|
|
5637
5875
|
static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
|
5638
|
-
const int row = blockIdx.
|
5876
|
+
const int row = blockIdx.x;
|
5639
5877
|
const int col = threadIdx.x;
|
5640
5878
|
|
5641
5879
|
float sum = 0.0f;
|
@@ -5833,7 +6071,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
|
|
5833
6071
|
}
|
5834
6072
|
#else
|
5835
6073
|
(void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
|
5836
|
-
|
6074
|
+
NO_DEVICE_CODE;
|
5837
6075
|
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
5838
6076
|
}
|
5839
6077
|
|
@@ -5957,9 +6195,10 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
|
|
5957
6195
|
dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
|
5958
6196
|
}
|
5959
6197
|
|
5960
|
-
|
5961
|
-
|
5962
|
-
|
6198
|
+
template <typename T>
|
6199
|
+
static __global__ void im2col_kernel(
|
6200
|
+
const float * x, T * dst, int batch_offset,
|
6201
|
+
int offset_delta, int IC, int IW, int IH, int OH, int OW, int KW, int KH, int pelements, int CHW,
|
5963
6202
|
int s0, int s1, int p0, int p1, int d0, int d1) {
|
5964
6203
|
const int i = threadIdx.x + blockIdx.x * blockDim.x;
|
5965
6204
|
if (i >= pelements) {
|
@@ -5972,21 +6211,73 @@ static __global__ void im2col_f32_f16(
|
|
5972
6211
|
const int ky = (i - kd) / OW;
|
5973
6212
|
const int ix = i % OW;
|
5974
6213
|
|
6214
|
+
const int oh = blockIdx.y;
|
6215
|
+
const int batch = blockIdx.z / IC;
|
6216
|
+
const int ic = blockIdx.z % IC;
|
6217
|
+
|
5975
6218
|
const int64_t iiw = ix * s0 + kx * d0 - p0;
|
5976
|
-
const int64_t iih =
|
6219
|
+
const int64_t iih = oh * s1 + ky * d1 - p1;
|
5977
6220
|
|
5978
6221
|
const int64_t offset_dst =
|
5979
|
-
(
|
5980
|
-
(
|
6222
|
+
((batch * OH + oh) * OW + ix) * CHW +
|
6223
|
+
(ic * (KW * KH) + ky * KW + kx);
|
5981
6224
|
|
5982
6225
|
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
5983
|
-
dst[offset_dst] =
|
6226
|
+
dst[offset_dst] = 0.0f;
|
5984
6227
|
} else {
|
5985
|
-
const int64_t offset_src =
|
5986
|
-
dst[offset_dst] =
|
6228
|
+
const int64_t offset_src = ic * offset_delta + batch * batch_offset;
|
6229
|
+
dst[offset_dst] = x[offset_src + iih * IW + iiw];
|
5987
6230
|
}
|
5988
6231
|
}
|
5989
6232
|
|
6233
|
+
template <typename Ti, typename To>
|
6234
|
+
static __global__ void pool2d_nchw_kernel(
|
6235
|
+
const int ih, const int iw, const int oh, const int ow,
|
6236
|
+
const int kh, const int kw, const int sh, const int sw,
|
6237
|
+
const int ph, const int pw, const int parallel_elements,
|
6238
|
+
const Ti* src, To* dst, const enum ggml_op_pool op) {
|
6239
|
+
int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
6240
|
+
if (idx >= parallel_elements) {
|
6241
|
+
return;
|
6242
|
+
}
|
6243
|
+
|
6244
|
+
const int I_HW = ih * iw;
|
6245
|
+
const int O_HW = oh * ow;
|
6246
|
+
const int nc = idx / O_HW;
|
6247
|
+
const int cur_oh = idx % O_HW / ow;
|
6248
|
+
const int cur_ow = idx % O_HW % ow;
|
6249
|
+
const Ti* i_ptr = src + nc * I_HW;
|
6250
|
+
To* o_ptr = dst + nc * O_HW;
|
6251
|
+
const int start_h = cur_oh * sh - ph;
|
6252
|
+
const int bh = max(0, start_h);
|
6253
|
+
const int eh = min(ih, start_h + kh);
|
6254
|
+
const int start_w = cur_ow * sw - pw;
|
6255
|
+
const int bw = max(0, start_w);
|
6256
|
+
const int ew = min(iw, start_w + kw);
|
6257
|
+
const To scale = 1. / (kh * kw);
|
6258
|
+
To res = 0;
|
6259
|
+
|
6260
|
+
switch (op) {
|
6261
|
+
case GGML_OP_POOL_AVG: res = 0; break;
|
6262
|
+
case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
|
6263
|
+
}
|
6264
|
+
|
6265
|
+
for (int i = bh; i < eh; i += 1) {
|
6266
|
+
for (int j = bw; j < ew; j += 1) {
|
6267
|
+
#if __CUDA_ARCH__ >= 350
|
6268
|
+
Ti cur = __ldg(i_ptr + i * iw + j);
|
6269
|
+
#else
|
6270
|
+
Ti cur = i_ptr[i * iw + j];
|
6271
|
+
#endif
|
6272
|
+
switch (op) {
|
6273
|
+
case GGML_OP_POOL_AVG: res += cur * scale; break;
|
6274
|
+
case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
|
6275
|
+
}
|
6276
|
+
}
|
6277
|
+
}
|
6278
|
+
o_ptr[cur_oh * ow + cur_ow] = res;
|
6279
|
+
}
|
6280
|
+
|
5990
6281
|
template<int qk, int qr, dequantize_kernel_t dq>
|
5991
6282
|
static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5992
6283
|
const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
|
@@ -6200,6 +6491,16 @@ static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
|
|
6200
6491
|
relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
6201
6492
|
}
|
6202
6493
|
|
6494
|
+
static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
6495
|
+
const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
|
6496
|
+
hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
6497
|
+
}
|
6498
|
+
|
6499
|
+
static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
6500
|
+
const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
|
6501
|
+
hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
6502
|
+
}
|
6503
|
+
|
6203
6504
|
static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
|
6204
6505
|
const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
|
6205
6506
|
leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
|
@@ -6360,6 +6661,12 @@ static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k,
|
|
6360
6661
|
dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
|
6361
6662
|
}
|
6362
6663
|
|
6664
|
+
template<typename dst_t>
|
6665
|
+
static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
6666
|
+
const int nb = k / QK_K;
|
6667
|
+
dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
|
6668
|
+
}
|
6669
|
+
|
6363
6670
|
template <typename src_t, typename dst_t>
|
6364
6671
|
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
|
6365
6672
|
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
@@ -6397,6 +6704,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
|
6397
6704
|
return dequantize_row_iq2_xxs_cuda;
|
6398
6705
|
case GGML_TYPE_IQ2_XS:
|
6399
6706
|
return dequantize_row_iq2_xs_cuda;
|
6707
|
+
case GGML_TYPE_IQ3_XXS:
|
6708
|
+
return dequantize_row_iq3_xxs_cuda;
|
6400
6709
|
case GGML_TYPE_F32:
|
6401
6710
|
return convert_unary_cuda<float>;
|
6402
6711
|
default:
|
@@ -6430,6 +6739,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
|
6430
6739
|
return dequantize_row_iq2_xxs_cuda;
|
6431
6740
|
case GGML_TYPE_IQ2_XS:
|
6432
6741
|
return dequantize_row_iq2_xs_cuda;
|
6742
|
+
case GGML_TYPE_IQ3_XXS:
|
6743
|
+
return dequantize_row_iq3_xxs_cuda;
|
6433
6744
|
case GGML_TYPE_F16:
|
6434
6745
|
return convert_unary_cuda<half>;
|
6435
6746
|
default:
|
@@ -6534,112 +6845,75 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
|
|
6534
6845
|
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
6535
6846
|
}
|
6536
6847
|
|
6537
|
-
|
6538
|
-
|
6539
|
-
const
|
6540
|
-
const
|
6541
|
-
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
6542
|
-
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
|
6543
|
-
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
6544
|
-
}
|
6848
|
+
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot>
|
6849
|
+
static void mul_mat_vec_q_cuda(
|
6850
|
+
const void * vx, const void * vy, float * dst,
|
6851
|
+
const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
|
6545
6852
|
|
6546
|
-
|
6547
|
-
GGML_ASSERT(
|
6548
|
-
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
6549
|
-
const dim3 block_nums(block_num_y, 1, 1);
|
6550
|
-
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
6551
|
-
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
|
6552
|
-
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
6553
|
-
}
|
6853
|
+
GGML_ASSERT(ncols_x % qk == 0);
|
6854
|
+
GGML_ASSERT(ncols_y <= 4);
|
6554
6855
|
|
6555
|
-
|
6556
|
-
|
6557
|
-
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
6558
|
-
const dim3 block_nums(block_num_y, 1, 1);
|
6559
|
-
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
6560
|
-
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
|
6561
|
-
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
6562
|
-
}
|
6563
|
-
|
6564
|
-
static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
6565
|
-
GGML_ASSERT(ncols % QK5_1 == 0);
|
6566
|
-
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
6567
|
-
const dim3 block_nums(block_num_y, 1, 1);
|
6568
|
-
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
6569
|
-
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
|
6570
|
-
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
6571
|
-
}
|
6572
|
-
|
6573
|
-
static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
6574
|
-
GGML_ASSERT(ncols % QK8_0 == 0);
|
6575
|
-
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
6576
|
-
const dim3 block_nums(block_num_y, 1, 1);
|
6577
|
-
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
6578
|
-
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
|
6579
|
-
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
6580
|
-
}
|
6581
|
-
|
6582
|
-
static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
6583
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
6584
|
-
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
6585
|
-
const dim3 block_nums(block_num_y, 1, 1);
|
6586
|
-
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
6587
|
-
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
|
6588
|
-
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
6589
|
-
}
|
6590
|
-
|
6591
|
-
static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
6592
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
6593
|
-
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
6594
|
-
const dim3 block_nums(block_num_y, 1, 1);
|
6595
|
-
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
6596
|
-
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
|
6597
|
-
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
6598
|
-
}
|
6599
|
-
|
6600
|
-
static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
6601
|
-
GGML_ASSERT(ncols % QK_K == 0);
|
6602
|
-
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
6603
|
-
const dim3 block_nums(block_num_y, 1, 1);
|
6604
|
-
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
6605
|
-
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
|
6606
|
-
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
6607
|
-
}
|
6856
|
+
int id;
|
6857
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
6608
6858
|
|
6609
|
-
|
6610
|
-
|
6611
|
-
|
6612
|
-
|
6613
|
-
|
6614
|
-
|
6615
|
-
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
6616
|
-
}
|
6859
|
+
int nwarps;
|
6860
|
+
if (g_device_caps[id].cc >= CC_OFFSET_AMD) {
|
6861
|
+
nwarps = g_device_caps[id].cc >= CC_RDNA2 ? MMVQ_NWARPS_AMD_RDNA2 : MMVQ_NWARPS_AMD_OLD;
|
6862
|
+
} else {
|
6863
|
+
nwarps = MMVQ_NWARPS_NVIDIA;
|
6864
|
+
}
|
6617
6865
|
|
6618
|
-
|
6619
|
-
|
6620
|
-
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
6621
|
-
const dim3 block_nums(block_num_y, 1, 1);
|
6622
|
-
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
6623
|
-
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
|
6624
|
-
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
6625
|
-
}
|
6866
|
+
const dim3 block_nums(nrows_x, 1, 1);
|
6867
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
6626
6868
|
|
6627
|
-
|
6628
|
-
|
6629
|
-
|
6630
|
-
|
6631
|
-
|
6632
|
-
|
6633
|
-
|
6634
|
-
|
6869
|
+
switch (nwarps) {
|
6870
|
+
case 1: switch(ncols_y) {
|
6871
|
+
case 1:
|
6872
|
+
mul_mat_vec_q<1, 1, qk, qi, block_q_t, vdr, vec_dot>
|
6873
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
6874
|
+
break;
|
6875
|
+
case 2:
|
6876
|
+
mul_mat_vec_q<1, 2, qk, qi, block_q_t, vdr, vec_dot>
|
6877
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
6878
|
+
break;
|
6879
|
+
case 3:
|
6880
|
+
mul_mat_vec_q<1, 3, qk, qi, block_q_t, vdr, vec_dot>
|
6881
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
6882
|
+
break;
|
6883
|
+
case 4:
|
6884
|
+
mul_mat_vec_q<1, 4, qk, qi, block_q_t, vdr, vec_dot>
|
6885
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
6886
|
+
break;
|
6887
|
+
default:
|
6888
|
+
GGML_ASSERT(false);
|
6889
|
+
break;
|
6890
|
+
} break;
|
6891
|
+
case 4: switch(ncols_y) {
|
6892
|
+
case 1:
|
6893
|
+
mul_mat_vec_q<4, 1, qk, qi, block_q_t, vdr, vec_dot>
|
6894
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
6895
|
+
break;
|
6896
|
+
case 2:
|
6897
|
+
mul_mat_vec_q<4, 2, qk, qi, block_q_t, vdr, vec_dot>
|
6898
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
6899
|
+
break;
|
6900
|
+
case 3:
|
6901
|
+
mul_mat_vec_q<4, 3, qk, qi, block_q_t, vdr, vec_dot>
|
6902
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
6903
|
+
break;
|
6904
|
+
case 4:
|
6905
|
+
mul_mat_vec_q<4, 4, qk, qi, block_q_t, vdr, vec_dot>
|
6906
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
|
6907
|
+
break;
|
6908
|
+
default:
|
6909
|
+
GGML_ASSERT(false);
|
6910
|
+
break;
|
6911
|
+
} break;
|
6635
6912
|
|
6636
|
-
|
6637
|
-
|
6638
|
-
|
6639
|
-
|
6640
|
-
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
6641
|
-
mul_mat_vec_q<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
|
6642
|
-
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
6913
|
+
default:
|
6914
|
+
GGML_ASSERT(false);
|
6915
|
+
break;
|
6916
|
+
}
|
6643
6917
|
}
|
6644
6918
|
|
6645
6919
|
static void ggml_mul_mat_q4_0_q8_1_cuda(
|
@@ -7114,69 +7388,82 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
         (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
 }
 
+
+static void ggml_cpy_f16_f32_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
 static void ggml_cpy_f32_f32_cuda(
     const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 
     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
     cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
 static void ggml_cpy_f32_f16_cuda(
     const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 
     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
     cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
 static void ggml_cpy_f32_q8_0_cuda(
     const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 
     GGML_ASSERT(ne % QK8_0 == 0);
     const int num_blocks = ne / QK8_0;
     cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
 static void ggml_cpy_f32_q4_0_cuda(
     const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 
     GGML_ASSERT(ne % QK4_0 == 0);
     const int num_blocks = ne / QK4_0;
     cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
 static void ggml_cpy_f32_q4_1_cuda(
     const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 
     GGML_ASSERT(ne % QK4_1 == 0);
     const int num_blocks = ne / QK4_1;
     cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
 static void ggml_cpy_f16_f16_cuda(
     const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
 
     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
     cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
+
+
 static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
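The copy helpers above gain the third extent (ne02/ne12) and the fourth-dimension byte strides (nb03/nb13). A small stand-alone sketch, under the assumption that ne holds per-dimension extents and nb holds byte strides as elsewhere in ggml, of how a flat element index maps to a byte offset:

    // Illustrative only: decompose a flat index into (i0, i1, i2, i3) and apply
    // ggml-style byte strides, as the widened ggml_cpy_*_cuda signatures allow.
    #include <cstdint>
    #include <cstdio>

    static int64_t byte_offset(int64_t i, const int64_t ne[4], const int64_t nb[4]) {
        const int64_t i0 =  i % ne[0];
        const int64_t i1 = (i / ne[0]) % ne[1];
        const int64_t i2 = (i / (ne[0] * ne[1])) % ne[2];
        const int64_t i3 =  i / (ne[0] * ne[1] * ne[2]);
        return i0 * nb[0] + i1 * nb[1] + i2 * nb[2] + i3 * nb[3];
    }

    int main() {
        const int64_t ne[4] = {4, 3, 2, 2};     // extents
        const int64_t nb[4] = {4, 16, 48, 96};  // contiguous f32 strides, in bytes
        printf("element 13 -> byte offset %lld\n", (long long) byte_offset(13, ne, nb));
        return 0;
    }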
@@ -7255,7 +7542,7 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
 
 static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums(
+    const dim3 block_nums(nrows, 1, 1);
     k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
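The change above launches one block per row (block_nums(nrows, 1, 1)) with WARP_SIZE threads. A hedged sketch of that reduction shape, assuming a 32-thread block so a single warp-shuffle reduction suffices (the kernel name here is illustrative, not the vendored k_sum_rows_f32):

    #include <cuda_runtime.h>

    // One block reduces one row of ncols floats; blockDim.x is assumed to be 32.
    __global__ void sum_rows_example(const float * x, float * dst, const int ncols) {
        const int row = blockIdx.x;
        float sum = 0.0f;
        for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
            sum += x[row * ncols + col];
        }
        // butterfly reduction across the warp
        for (int offset = 16; offset > 0; offset >>= 1) {
            sum += __shfl_xor_sync(0xffffffff, sum, offset);
        }
        if (threadIdx.x == 0) {
            dst[row] = sum;
        }
    }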
@@ -7367,14 +7654,15 @@ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, con
     }
 }
 
-
+template <typename T>
+static void im2col_cuda(const float* x, T* dst,
     int IW, int IH, int OW, int OH, int KW, int KH, int IC,
-    int offset_delta,
+    int batch, int batch_offset, int offset_delta,
     int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
     const int parallel_elements = OW * KW * KH;
     const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
-    dim3 block_nums(num_blocks, OH, IC);
-
+    dim3 block_nums(num_blocks, OH, batch * IC);
+    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
 }
 
 // buffer pool for cuda
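im2col_cuda above now folds the batch into the grid's z dimension (batch * IC blocks). An illustrative sketch, not the vendored im2col_kernel, of how a kernel can recover the batch and channel indices from blockIdx.z under that launch shape:

    #include <cuda_runtime.h>

    // gridDim.z = batch * IC; split blockIdx.z back into (batch, channel).
    __global__ void batched_channel_example(float * dst, const int IC, const int per_channel_elems) {
        const int batch   = blockIdx.z / IC;   // which image in the batch
        const int channel = blockIdx.z % IC;   // which input channel of that image
        const int i       = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= per_channel_elems) {
            return;
        }
        // toy write: tag each element with its (batch, channel) pair
        dst[(size_t)(batch * IC + channel) * per_channel_elems + i] = (float)(batch * 1000 + channel);
    }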
@@ -7959,6 +8247,34 @@ static void ggml_cuda_op_relu(
     (void) src1_dd;
 }
 
+static void ggml_cuda_op_hardsigmoid(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    hardsigmoid_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+static void ggml_cuda_op_hardswish(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    hardswish_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 static void ggml_cuda_op_leaky_relu(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
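The new ops forward to hardsigmoid_f32_cuda and hardswish_f32_cuda. As a hedged reference for the element-wise math these activations are commonly defined with, hardsigmoid(x) = clamp((x + 3) / 6, 0, 1) and hardswish(x) = x * hardsigmoid(x); a minimal kernel sketch with illustrative names:

    #include <cuda_runtime.h>

    __device__ __forceinline__ float hardsigmoid_example(float x) {
        return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
    }

    // One thread per element: dst[i] = x[i] * hardsigmoid(x[i]).
    __global__ void hardswish_example(const float * x, float * dst, const int k) {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= k) {
            return;
        }
        dst[i] = x[i] * hardsigmoid_example(x[i]);
    }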
@@ -8114,7 +8430,7 @@ static void ggml_cuda_op_mul_mat_q(
     CUDA_CHECK(cudaGetDevice(&id));
 
     // the main device has a larger memory buffer to hold the results from all GPUs
-    // nrows_dst == nrows of the matrix that the
+    // nrows_dst == nrows of the matrix that the kernel writes into
     const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
 
     switch (src0->type) {
@@ -8192,6 +8508,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
             return max_compute_capability >= CC_RDNA2 ? 128 : 64;
         default:
             GGML_ASSERT(false);
@@ -8214,6 +8531,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
             return max_compute_capability >= CC_VOLTA ? 128 : 64;
         case GGML_TYPE_Q6_K:
             return 64;
@@ -8243,47 +8561,73 @@ static void ggml_cuda_op_mul_mat_vec_q(
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, cudaStream_t stream) {
 
-    GGML_ASSERT(ggml_nrows(src1) == 1);
-
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
 
+    const int64_t ne10 = src1->ne[0];
+    GGML_ASSERT(ne10 % QK8_1 == 0);
+
+    const int64_t ne0 = dst->ne[0];
+
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // nrows_dst == nrows of the matrix that the kernel writes into
+    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
+
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-
+            mul_mat_vec_q_cuda<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
         case GGML_TYPE_Q4_1:
-
+            mul_mat_vec_q_cuda<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
        case GGML_TYPE_Q5_0:
-
+            mul_mat_vec_q_cuda<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_1:
-
+            mul_mat_vec_q_cuda<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q8_0:
-
+            mul_mat_vec_q_cuda<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q2_K:
-
+            mul_mat_vec_q_cuda<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q3_K:
-
+            mul_mat_vec_q_cuda<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q4_K:
-
+            mul_mat_vec_q_cuda<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q5_K:
-
+            mul_mat_vec_q_cuda<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_Q6_K:
-
+            mul_mat_vec_q_cuda<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ2_XXS:
-
+            mul_mat_vec_q_cuda<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ2_XS:
-
+            mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
+        case GGML_TYPE_IQ3_XXS:
+            mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        default:
            GGML_ASSERT(false);
@@ -8319,9 +8663,9 @@ static void ggml_cuda_op_dequantize_mul_mat_vec(
 
     if (src1_convert_f16) {
         src1_dfloat = src1_dfloat_a.alloc(ne00);
-
-
-
+        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+        GGML_ASSERT(to_fp16_cuda != nullptr);
+        to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
     }
 #else
     const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
@@ -8585,13 +8929,46 @@ static void ggml_cuda_op_alibi(
     (void) src1_dd;
 }
 
+static void ggml_cuda_op_pool2d(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+
+    const int64_t IH = src0->ne[1];
+    const int64_t IW = src0->ne[0];
+
+    const int64_t N = dst->ne[3];
+    const int64_t OC = dst->ne[2];
+    const int64_t OH = dst->ne[1];
+    const int64_t OW = dst->ne[0];
+
+    const int parallel_elements = N * OC * OH * OW;
+    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
+    dim3 block_nums(num_blocks);
+    pool2d_nchw_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, main_stream>>>(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_dd, dst_dd, op);
+
+    (void) src1;
+    (void) src1_dd;
+}
+
 static void ggml_cuda_op_im2col(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
 
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
 
     const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
     const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
@@ -8613,8 +8990,14 @@ static void ggml_cuda_op_im2col(
     const int64_t OW = dst->ne[1];
 
     const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+    const int64_t batch = src1->ne[3];
+    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
 
-
+    if(dst->type == GGML_TYPE_F16) {
+        im2col_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
+    } else {
+        im2col_cuda(src1_dd, (float*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
+    }
 
     (void) src0;
     (void) src0_dd;
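ggml_cuda_op_pool2d above unpacks (op, k0, k1, s0, s1, p0, p1) from dst->op_params and launches one thread per output element. For reference, the usual output extent of a pooled axis, shown as a small stand-alone helper (an assumed formula for illustration, not code taken from the vendored kernel):

    #include <cstdio>

    // floor((in + 2*pad - kernel) / stride) + 1
    static int pooled_extent(int in, int kernel, int stride, int pad) {
        return (in + 2 * pad - kernel) / stride + 1;
    }

    int main() {
        // e.g. a 224-wide input, 3-wide kernel, stride 2, padding 1 -> 112 outputs per row
        printf("%d\n", pooled_extent(224, 3, 2, 1));
        return 0;
    }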
@@ -9210,6 +9593,13 @@ static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
 }
 
+static void ggml_cuda_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardsigmoid);
+}
+
+static void ggml_cuda_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardswish);
+}
 static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
 }
@@ -9561,17 +9951,18 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 #ifdef GGML_CUDA_FORCE_DMMV
         const bool use_mul_mat_vec_q = false;
 #else
-        const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type)
+        const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
 #endif // GGML_CUDA_FORCE_DMMV
 
         if (use_mul_mat_vec_q) {
-            // NOTE: this kernel does not support ggml_nrows(src1) > 1
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
         } else {
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
         }
     } else {
-        if (
+        if (src1->ne[1] <= 4 && min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32) {
+            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+        } else if (use_mul_mat_q) {
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
         } else {
             ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
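The dispatch change above routes quantized src0 with up to four src1 columns to the mul_mat_vec_q path. Restated as a stand-alone predicate for clarity; the 610 threshold is an assumed stand-in for MIN_CC_DP4A and the function name is hypothetical:

    #include <cstdint>

    // Assumed sketch of the routing test, not the vendored code.
    static bool small_batch_uses_mmvq(int64_t src1_ne1, int min_compute_capability,
                                      bool src0_is_quantized, bool src1_is_f32) {
        const int min_cc_dp4a = 610;   // assumed value of the DP4A capability threshold
        return src1_ne1 <= 4 && min_compute_capability >= min_cc_dp4a
            && src0_is_quantized && src1_is_f32;
    }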
@@ -9769,8 +10160,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     // TODO: mmq/mmv support
 #endif
 
-    const
-    const
+    const size_t nb11 = src1->nb[1];
+    const size_t nb1  =  dst->nb[1];
 
     const struct ggml_tensor * ids = src0;
     const int32_t id = ((int32_t *) dst->op_params)[0];
@@ -9920,19 +10311,25 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
 
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
-
+    const int64_t ne02 = src0->ne[2];
+
+    //GGML_ASSERT(src0->ne[3] == 1);
 
     const int64_t nb00 = src0->nb[0];
     const int64_t nb01 = src0->nb[1];
     const int64_t nb02 = src0->nb[2];
+    const int64_t nb03 = src0->nb[3];
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
-
+    const int64_t ne12 = src1->ne[2];
+
+    //GGML_ASSERT(src1->ne[3] == 1);
 
     const int64_t nb10 = src1->nb[0];
     const int64_t nb11 = src1->nb[1];
     const int64_t nb12 = src1->nb[2];
+    const int64_t nb13 = src1->nb[3];
 
     ggml_cuda_set_device(g_main_device);
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
@@ -9944,17 +10341,19 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
 
     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -9987,6 +10386,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }
 
+static void ggml_cuda_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pool2d);
+}
+
 static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
 }
@@ -10088,6 +10491,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
             case GGML_UNARY_OP_RELU:
                 func = ggml_cuda_relu;
                 break;
+            case GGML_UNARY_OP_HARDSIGMOID:
+                func = ggml_cuda_hardsigmoid;
+                break;
+            case GGML_UNARY_OP_HARDSWISH:
+                func = ggml_cuda_hardswish;
+                break;
             default:
                 return false;
         }
@@ -10162,6 +10571,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
         case GGML_OP_IM2COL:
             func = ggml_cuda_im2col;
             break;
+        case GGML_OP_POOL_2D:
+            func = ggml_cuda_pool2d;
+            break;
         case GGML_OP_SUM_ROWS:
             func = ggml_cuda_sum_rows;
             break;
@@ -10283,15 +10695,11 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
 
     if (ggml_is_quantized(tensor->type)) {
         // initialize padding to 0 to avoid possible NaN values
-
-        int64_t row_high = ggml_nrows(tensor);
-        int64_t nrows_split = row_high - row_low;
-
-        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+        size_t original_size = ggml_nbytes(tensor);
         size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
         if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(
+            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
         }
     }
 }
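The padding logic above now zeroes the gap between the tensor's byte size and its padded allocation with a single cudaMemset, so block-wise kernels never read uninitialized bytes. A hedged, stand-alone sketch of that step (the helper name is illustrative):

    #include <cuda_runtime.h>
    #include <cstddef>

    // Clear the tail of an allocation whose padded size exceeds the real data size.
    static cudaError_t zero_padding(void * data, size_t original_size, size_t padded_size) {
        if (padded_size <= original_size) {
            return cudaSuccess;   // nothing to clear
        }
        return cudaMemset((char *) data + original_size, 0, padded_size - original_size);
    }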
@@ -10394,12 +10802,7 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
 }
 
 GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-
-    int64_t row_high = ggml_nrows(tensor);
-    int64_t nrows_split = row_high - row_low;
-
-    size_t size = ggml_nbytes_split(tensor, nrows_split);
-
+    size_t size = ggml_nbytes(tensor);
     int64_t ne0 = tensor->ne[0];
 
     if (ggml_is_quantized(tensor->type)) {
@@ -10428,6 +10831,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
+    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
     /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
@@ -10703,6 +11107,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
     /* .get_name         = */ ggml_backend_cuda_split_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
+    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
     /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
     /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
@@ -10782,6 +11187,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
     /* .get_name         = */ ggml_backend_cuda_host_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
     /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
     /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
@@ -10896,6 +11302,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_UNARY_OP_GELU:
         case GGML_UNARY_OP_SILU:
         case GGML_UNARY_OP_RELU:
+        case GGML_UNARY_OP_HARDSIGMOID:
+        case GGML_UNARY_OP_HARDSWISH:
         case GGML_UNARY_OP_GELU_QUICK:
         case GGML_UNARY_OP_TANH:
             return true;
@@ -10918,6 +11326,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                 if (a->ne[3] != b->ne[3]) {
                     return false;
                 }
+                ggml_type a_type = a->type;
+                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS) {
+                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
+                        return false;
+                    }
+                }
                 return true;
             } break;
         case GGML_OP_GET_ROWS:
@@ -10957,6 +11371,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                 if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
                     return true;
                 }
+                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
                 return false;
             } break;
         case GGML_OP_DUP:
@@ -10985,6 +11402,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_ROPE:
         case GGML_OP_ALIBI:
         case GGML_OP_IM2COL:
+        case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC: