llama_cpp 0.12.3 → 0.12.5

This diff covers publicly available package versions released to one of the supported registries; it is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -12,9 +12,10 @@
12
12
  #include <vector>
13
13
  #include <map>
14
14
  #include <array>
15
- #include "ggml-cuda.h"
16
- #include "ggml.h"
17
- #include "ggml-backend-impl.h"
15
+
16
+ // stringize macros for converting __CUDA_ARCH_LIST__ (a list of integers) to a string
17
+ #define STRINGIZE_IMPL(...) #__VA_ARGS__
18
+ #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
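These are the standard two-step stringification macros: STRINGIZE expands its arguments first (so __CUDA_ARCH_LIST__ is replaced by the actual list of architectures) and STRINGIZE_IMPL then turns the expanded token list into a string literal. A minimal sketch of the effect, with a hypothetical architecture list standing in for __CUDA_ARCH_LIST__:

    #define EXAMPLE_ARCH_LIST 610,700,860                          // stand-in for __CUDA_ARCH_LIST__
    static const char * arch_str = STRINGIZE(EXAMPLE_ARCH_LIST);   // expands to "610,700,860"
    // a single-level #__VA_ARGS__ would instead produce the literal "EXAMPLE_ARCH_LIST"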
18
19
 
19
20
  #if defined(GGML_USE_HIPBLAS)
20
21
  #include <hip/hip_runtime.h>
@@ -118,6 +119,11 @@
118
119
 
119
120
  #endif // defined(GGML_USE_HIPBLAS)
120
121
 
122
+ // ggml-cuda needs the half type, so keep the ggml header includes last
123
+ #include "ggml-cuda.h"
124
+ #include "ggml.h"
125
+ #include "ggml-backend-impl.h"
126
+
121
127
  #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
122
128
 
123
129
  #define CC_PASCAL 600
@@ -185,6 +191,10 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
185
191
  #endif // __has_builtin(__builtin_elementwise_sub_sat)
186
192
  }
187
193
 
194
+ static __device__ __forceinline__ int __vsub4(const int a, const int b) {
195
+ return __vsubss4(a, b);
196
+ }
197
+
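This adds a __vsub4 shim for the HIP path next to the existing __vsubss4 emulation; CUDA's __vsub4 subtracts four packed 8-bit lanes within one 32-bit word. Mapping it onto the saturating __vsubss4 above is an approximation that only matters if a lane difference leaves the signed 8-bit range. A rough scalar model of the wrap-around per-byte semantics, for reference (hypothetical helper, not part of the diff):

    #include <stdint.h>
    // wrap-around per-byte subtraction of two packed 4x8-bit values
    static inline int vsub4_ref(int a, int b) {
        uint32_t r = 0;
        for (int i = 0; i < 4; ++i) {
            const uint8_t ab = (a >> (8*i)) & 0xff;
            const uint8_t bb = (b >> (8*i)) & 0xff;
            r |= (uint32_t)(uint8_t)(ab - bb) << (8*i);
        }
        return (int) r;
    }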
188
198
  static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
189
199
  #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
190
200
  c = __builtin_amdgcn_sdot4(a, b, c, false);
@@ -499,6 +509,14 @@ typedef struct {
499
509
  } block_iq2_xs;
500
510
  static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
501
511
 
512
+ #define QR3_XXS 8
513
+ #define QI3_XXS (QK_K / (4*QR3_XXS))
514
+ typedef struct {
515
+ half d;
516
+ uint8_t qs[3*(QK_K/8)];
517
+ } block_iq3_xxs;
518
+ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
519
+
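The new block_iq3_xxs type stores one super-block of QK_K = 256 weights as a single fp16 scale plus 3*(QK_K/8) = 96 bytes of quant data; as the dequantization kernel below reads them, the first 64 bytes are 8-bit indices into iq3xxs_grid and the remaining 32 bytes are packed sign/scale words. A quick size check under QK_K == 256:

    // sizeof(half)     =  2 bytes
    // qs[3*(256/8)]    = 96 bytes
    // total            = 98 bytes per 256 weights  ->  98*8/256 = 3.0625 bits per weight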
502
520
  #define WARP_SIZE 32
503
521
  #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
504
522
 
@@ -506,6 +524,8 @@ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16
506
524
  #define CUDA_SILU_BLOCK_SIZE 256
507
525
  #define CUDA_TANH_BLOCK_SIZE 256
508
526
  #define CUDA_RELU_BLOCK_SIZE 256
527
+ #define CUDA_HARDSIGMOID_BLOCK_SIZE 256
528
+ #define CUDA_HARDSWISH_BLOCK_SIZE 256
509
529
  #define CUDA_SQR_BLOCK_SIZE 256
510
530
  #define CUDA_CPY_BLOCK_SIZE 32
511
531
  #define CUDA_SCALE_BLOCK_SIZE 256
@@ -522,6 +542,7 @@ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16
522
542
  #define CUDA_PAD_BLOCK_SIZE 256
523
543
  #define CUDA_ACC_BLOCK_SIZE 256
524
544
  #define CUDA_IM2COL_BLOCK_SIZE 256
545
+ #define CUDA_POOL2D_BLOCK_SIZE 256
525
546
 
526
547
  #define CUDA_Q8_0_NE_ALIGN 2048
527
548
 
@@ -582,13 +603,28 @@ static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0,
582
603
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
583
604
 
584
605
  [[noreturn]]
585
- static __device__ void bad_arch() {
586
- printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
606
+ static __device__ void no_device_code(
607
+ const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
608
+
609
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
610
+ printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
611
+ file_name, line, function_name, arch);
612
+ (void) arch_list;
613
+ #else
614
+ printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
615
+ file_name, line, function_name, arch, arch_list);
616
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
587
617
  __trap();
588
618
 
589
- (void) bad_arch; // suppress unused function warning
619
+ (void) no_device_code; // suppress unused function warning
590
620
  }
591
621
 
622
+ #ifdef __CUDA_ARCH__
623
+ #define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
624
+ #else
625
+ #define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
626
+ #endif // __CUDA_ARCH__
627
+
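no_device_code() and the NO_DEVICE_CODE macro replace the old bad_arch(): besides trapping, they now report the source location, the kernel name, the architecture the kernel was launched on, and (on CUDA) the architecture list the file was compiled for. A minimal sketch of the intended usage pattern, mirroring the kernels updated later in this diff:

    static __global__ void example_kernel(float * dst) {
    #if __CUDA_ARCH__ >= CC_PASCAL
        dst[threadIdx.x] = 1.0f;  // real device code for supported architectures
    #else
        (void) dst;
        NO_DEVICE_CODE;           // prints the diagnostic above and calls __trap()
    #endif
    }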
592
628
  static __device__ __forceinline__ float warp_reduce_sum(float x) {
593
629
  #pragma unroll
594
630
  for (int mask = 16; mask > 0; mask >>= 1) {
@@ -615,7 +651,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
615
651
  return a;
616
652
  #else
617
653
  (void) a;
618
- bad_arch();
654
+ NO_DEVICE_CODE;
619
655
  #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
620
656
  }
621
657
 
@@ -636,7 +672,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
636
672
  return x;
637
673
  #else
638
674
  (void) x;
639
- bad_arch();
675
+ NO_DEVICE_CODE;
640
676
  #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
641
677
  }
642
678
 
@@ -790,6 +826,24 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
790
826
  dst[i] = fmaxf(x[i], 0);
791
827
  }
792
828
 
829
+ static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
830
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
831
+
832
+ if (i >= k) {
833
+ return;
834
+ }
835
+ dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
836
+ }
837
+
838
+ static __global__ void hardswish_f32(const float * x, float * dst, const int k) {
839
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
840
+
841
+ if (i >= k) {
842
+ return;
843
+ }
844
+ dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
845
+ }
846
+
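Both kernels map one thread per element and implement the usual piecewise-linear definitions: hardsigmoid(x) = clamp((x + 3) / 6, 0, 1) and hardswish(x) = x * hardsigmoid(x). A host-side reference for a quick sanity check (hypothetical helpers, not part of the diff):

    #include <math.h>
    static float hardsigmoid_ref(float x) { return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f)); }
    static float hardswish_ref  (float x) { return x * hardsigmoid_ref(x); }
    // hardswish_ref(-3.0f) == 0.0f,  hardswish_ref(0.0f) == 0.0f,  hardswish_ref(3.0f) == 3.0f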
793
847
  static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
794
848
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
795
849
  if (i >= k) {
@@ -1592,6 +1646,41 @@ static const __device__ uint64_t iq2xs_grid[512] = {
1592
1646
  0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
1593
1647
  };
1594
1648
 
1649
+ static const __device__ uint32_t iq3xxs_grid[256] = {
1650
+ 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
1651
+ 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
1652
+ 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
1653
+ 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
1654
+ 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
1655
+ 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
1656
+ 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
1657
+ 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
1658
+ 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
1659
+ 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
1660
+ 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
1661
+ 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
1662
+ 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
1663
+ 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
1664
+ 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
1665
+ 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
1666
+ 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
1667
+ 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
1668
+ 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
1669
+ 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
1670
+ 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
1671
+ 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
1672
+ 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
1673
+ 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
1674
+ 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
1675
+ 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
1676
+ 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
1677
+ 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
1678
+ 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
1679
+ 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
1680
+ 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
1681
+ 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
1682
+ };
1683
+
1595
1684
  static const __device__ uint8_t ksigns_iq2xs[128] = {
1596
1685
  0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
1597
1686
  144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
@@ -1603,6 +1692,43 @@ static const __device__ uint8_t ksigns_iq2xs[128] = {
1603
1692
  240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
1604
1693
  };
1605
1694
 
1695
+ //#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
1696
+ static const __device__ uint64_t ksigns64[128] = {
1697
+ 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
1698
+ 0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
1699
+ 0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
1700
+ 0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
1701
+ 0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
1702
+ 0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
1703
+ 0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
1704
+ 0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
1705
+ 0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
1706
+ 0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
1707
+ 0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
1708
+ 0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
1709
+ 0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
1710
+ 0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
1711
+ 0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
1712
+ 0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
1713
+ 0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
1714
+ 0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
1715
+ 0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
1716
+ 0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
1717
+ 0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
1718
+ 0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
1719
+ 0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
1720
+ 0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
1721
+ 0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
1722
+ 0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
1723
+ 0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
1724
+ 0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
1725
+ 0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
1726
+ 0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
1727
+ 0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
1728
+ 0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
1729
+ };
1730
+ //#endif
1731
+
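ksigns64 is the 64-bit companion of ksigns_iq2xs below: entry n expands the 8 bits of ksigns_iq2xs[n] (7 sign bits plus a parity bit) into eight bytes that are each either 0x00 or 0xFF, so a single table lookup yields a byte mask covering eight packed quants for the XOR/subtract sign trick used further down. A sketch of the relation, derivable from the two tables (illustrative host-side code; the real tables live in device memory):

    #include <stdint.h>
    // ksigns64[n] has byte j == 0xFF exactly when bit j of ksigns_iq2xs[n] is set
    static uint64_t ksigns64_ref(uint8_t s) {
        uint64_t r = 0;
        for (int j = 0; j < 8; ++j) {
            if (s & (1u << j)) r |= 0xffull << (8*j);
        }
        return r;
    }
    // example: ksigns_iq2xs[1] == 129 == 0b10000001  ->  ksigns64[1] == 0xff000000000000ff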
1606
1732
  static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
1607
1733
 
1608
1734
  inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
@@ -1669,6 +1795,34 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
1669
1795
 
1670
1796
  }
1671
1797
 
1798
+ template<typename dst_t>
1799
+ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
1800
+
1801
+ const int i = blockIdx.x;
1802
+ const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
1803
+
1804
+ const int tid = threadIdx.x;
1805
+ #if QK_K == 256
1806
+ const int il = tid/8; // 0...3
1807
+ const int ib = tid%8; // 0...7
1808
+ dst_t * y = yy + i*QK_K + 32*ib + 8*il;
1809
+ const uint8_t * q3 = x[i].qs + 8*ib;
1810
+ const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
1811
+ const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
1812
+ const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
1813
+ const uint32_t aux32 = gas[0] | (gas[1] << 16);
1814
+ const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
1815
+ const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
1816
+ for (int j = 0; j < 4; ++j) {
1817
+ y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
1818
+ y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
1819
+ }
1820
+ #else
1821
+ assert(false);
1822
+ #endif
1823
+
1824
+ }
1825
+
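Each 32-thread block dequantizes one iq3_xxs super-block: every thread handles 8 values, looking up two 4-value points from iq3xxs_grid, while the per-sub-block word aux32 carries four 7-bit sign selectors in its low 28 bits and a 4-bit scale in its top nibble. The scale decode used above, shown for a concrete value:

    // if the top nibble of aux32 is 10:
    //   d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f
    //     = (float)x[i].d * 5.25f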
1672
1826
  static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
1673
1827
 
1674
1828
  static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
@@ -2419,7 +2573,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
2419
2573
  }
2420
2574
  #else
2421
2575
  (void) vx; (void) y; (void) k;
2422
- bad_arch();
2576
+ NO_DEVICE_CODE;
2423
2577
  #endif // __CUDA_ARCH__ >= CC_PASCAL
2424
2578
  }
2425
2579
 
@@ -2450,7 +2604,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
2450
2604
  // second part effectively subtracts 8 from each quant value
2451
2605
  return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
2452
2606
  #else
2453
- bad_arch();
2607
+ NO_DEVICE_CODE;
2454
2608
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2455
2609
  }
2456
2610
 
@@ -2487,7 +2641,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
2487
2641
  // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
2488
2642
  return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
2489
2643
  #else
2490
- bad_arch();
2644
+ NO_DEVICE_CODE;
2491
2645
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2492
2646
  }
2493
2647
 
@@ -2522,7 +2676,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
2522
2676
  // second part effectively subtracts 16 from each quant value
2523
2677
  return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
2524
2678
  #else
2525
- bad_arch();
2679
+ NO_DEVICE_CODE;
2526
2680
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2527
2681
  }
2528
2682
 
@@ -2567,7 +2721,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
2567
2721
  return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
2568
2722
 
2569
2723
  #else
2570
- bad_arch();
2724
+ NO_DEVICE_CODE;
2571
2725
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2572
2726
  }
2573
2727
 
@@ -2588,7 +2742,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
2588
2742
 
2589
2743
  return d8_0*d8_1 * sumi;
2590
2744
  #else
2591
- bad_arch();
2745
+ NO_DEVICE_CODE;
2592
2746
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2593
2747
  }
2594
2748
 
@@ -2618,7 +2772,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
2618
2772
  // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
2619
2773
  return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
2620
2774
  #else
2621
- bad_arch();
2775
+ NO_DEVICE_CODE;
2622
2776
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2623
2777
  }
2624
2778
 
@@ -2653,7 +2807,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
2653
2807
 
2654
2808
  return dm2f.x*sumf_d - dm2f.y*sumf_m;
2655
2809
  #else
2656
- bad_arch();
2810
+ NO_DEVICE_CODE;
2657
2811
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2658
2812
  }
2659
2813
 
@@ -2690,7 +2844,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
2690
2844
 
2691
2845
  return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
2692
2846
  #else
2693
- bad_arch();
2847
+ NO_DEVICE_CODE;
2694
2848
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2695
2849
  }
2696
2850
 
@@ -2730,7 +2884,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
2730
2884
 
2731
2885
  return d3 * sumf;
2732
2886
  #else
2733
- bad_arch();
2887
+ NO_DEVICE_CODE;
2734
2888
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2735
2889
  }
2736
2890
 
@@ -2755,7 +2909,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
2755
2909
 
2756
2910
  return d3*d8 * sumi;
2757
2911
  #else
2758
- bad_arch();
2912
+ NO_DEVICE_CODE;
2759
2913
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2760
2914
  }
2761
2915
 
@@ -2788,7 +2942,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
2788
2942
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
2789
2943
 
2790
2944
  #else
2791
- bad_arch();
2945
+ NO_DEVICE_CODE;
2792
2946
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2793
2947
  }
2794
2948
 
@@ -2821,7 +2975,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
2821
2975
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
2822
2976
 
2823
2977
  #else
2824
- bad_arch();
2978
+ NO_DEVICE_CODE;
2825
2979
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2826
2980
  }
2827
2981
 
@@ -2861,7 +3015,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
2861
3015
  return dm5f.x*sumf_d - dm5f.y*sumf_m;
2862
3016
 
2863
3017
  #else
2864
- bad_arch();
3018
+ NO_DEVICE_CODE;
2865
3019
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2866
3020
  }
2867
3021
 
@@ -2894,7 +3048,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
2894
3048
  return dm4f.x*sumf_d - dm4f.y*sumf_m;
2895
3049
 
2896
3050
  #else
2897
- bad_arch();
3051
+ NO_DEVICE_CODE;
2898
3052
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2899
3053
  }
2900
3054
 
@@ -2924,7 +3078,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
2924
3078
 
2925
3079
  return d*sumf;
2926
3080
  #else
2927
- bad_arch();
3081
+ NO_DEVICE_CODE;
2928
3082
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2929
3083
  }
2930
3084
 
@@ -2955,7 +3109,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
2955
3109
  return d6 * sumf_d;
2956
3110
 
2957
3111
  #else
2958
- bad_arch();
3112
+ NO_DEVICE_CODE;
2959
3113
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
2960
3114
  }
2961
3115
 
@@ -3821,7 +3975,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
3821
3975
  return dall * sumf_d - dmin * sumf_m;
3822
3976
 
3823
3977
  #else
3824
- bad_arch();
3978
+ NO_DEVICE_CODE;
3825
3979
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
3826
3980
 
3827
3981
  #endif
@@ -4004,7 +4158,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
4004
4158
  return d * sumf_d;
4005
4159
 
4006
4160
  #else
4007
- bad_arch();
4161
+ NO_DEVICE_CODE;
4008
4162
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
4009
4163
 
4010
4164
  #endif
@@ -4262,7 +4416,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
4262
4416
  q8 += 8;
4263
4417
  aux32 >>= 7;
4264
4418
  }
4265
- const float d = (float)bq2->d * (0.5f + aux32) * (float)bq8_1[ib32].ds.x * 0.25f;
4419
+ const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
4266
4420
  return d * sumi;
4267
4421
  #else
4268
4422
  // iqs is 0...15
@@ -4273,7 +4427,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
4273
4427
  const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
4274
4428
  const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
4275
4429
  const uint32_t aux32 = q2[2] | (q2[3] << 16);
4276
- const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * (float)bq8_1[ib32].ds.x * 0.25f;
4430
+ const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
4277
4431
  const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
4278
4432
  const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
4279
4433
  const int8_t * q8 = bq8_1[ib32].qs + 16*il;
@@ -4292,6 +4446,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
4292
4446
 
4293
4447
  static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
4294
4448
  const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
4449
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
4295
4450
  #if QK_K == 256
4296
4451
  const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
4297
4452
 
@@ -4302,28 +4457,69 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
4302
4457
  const uint8_t ls2 = bq2->scales[ib32] >> 4;
4303
4458
  int sumi1 = 0;
4304
4459
  for (int l = 0; l < 2; ++l) {
4305
- const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
4306
- const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
4307
- for (int j = 0; j < 8; ++j) {
4308
- sumi1 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
4309
- }
4460
+ const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
4461
+ const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
4462
+ const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
4463
+ const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
4464
+ sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
4465
+ sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
4310
4466
  q8 += 8;
4311
4467
  }
4312
4468
  int sumi2 = 0;
4313
4469
  for (int l = 2; l < 4; ++l) {
4314
- const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
4315
- const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
4316
- for (int j = 0; j < 8; ++j) {
4317
- sumi2 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
4318
- }
4470
+ const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
4471
+ const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
4472
+ const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
4473
+ const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
4474
+ sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
4475
+ sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
4319
4476
  q8 += 8;
4320
4477
  }
4321
- const float d = (float)bq2->d * (float)bq8_1[ib32].ds.x * 0.25f;
4478
+ const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
4322
4479
  return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
4323
4480
  #else
4324
4481
  assert(false);
4325
4482
  return 0.f;
4326
4483
  #endif
4484
+ #else
4485
+ assert(false);
4486
+ return 0.f;
4487
+ #endif
4488
+ }
4489
+
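The rewritten iq2_xs dot product replaces the scalar per-byte loop with packed integer math: a grid row and its sign mask are loaded as two 32-bit words, the signs are applied with the identity (g ^ s) - s (each mask byte is 0x00 to keep the value or 0xFF to negate it), and the signed products against q8 are accumulated with __dp4a. A scalar sketch of the per-byte identity, assuming wrap-around byte arithmetic:

    // per-byte view of  __vsub4(grid ^ signs, signs):
    //   s == 0x00 :  (g ^ 0x00) - 0x00 ==  g
    //   s == 0xff :  (g ^ 0xff) - 0xff == ~g + 1 == -g   (two's complement negation)
    static inline int8_t apply_sign_ref(uint8_t g, uint8_t s) {
        return (int8_t)(uint8_t)((g ^ s) - s);
    }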
4490
+ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
4491
+ const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
4492
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
4493
+ #if QK_K == 256
4494
+ const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
4495
+
4496
+ const int ib32 = iqs;
4497
+ const uint8_t * q3 = bq2->qs + 8*ib32;
4498
+ const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
4499
+ const int8_t * q8 = bq8_1[ib32].qs;
4500
+ uint32_t aux32 = gas[0] | (gas[1] << 16);
4501
+ int sumi = 0;
4502
+ for (int l = 0; l < 4; ++l) {
4503
+ const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
4504
+ const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
4505
+ const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
4506
+ const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
4507
+ const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
4508
+ sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
4509
+ sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
4510
+ q8 += 8;
4511
+ aux32 >>= 7;
4512
+ }
4513
+ const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
4514
+ return d * sumi;
4515
+ #else
4516
+ assert(false);
4517
+ return 0.f;
4518
+ #endif
4519
+ #else
4520
+ assert(false);
4521
+ return 0.f;
4522
+ #endif
4327
4523
  }
4328
4524
 
4329
4525
  template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
@@ -4499,7 +4695,7 @@ template <bool need_check> static __global__ void
4499
4695
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4500
4696
  #else
4501
4697
  (void) vec_dot_q4_0_q8_1_mul_mat;
4502
- bad_arch();
4698
+ NO_DEVICE_CODE;
4503
4699
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4504
4700
  }
4505
4701
 
@@ -4568,7 +4764,7 @@ template <bool need_check> static __global__ void
4568
4764
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4569
4765
  #else
4570
4766
  (void) vec_dot_q4_1_q8_1_mul_mat;
4571
- bad_arch();
4767
+ NO_DEVICE_CODE;
4572
4768
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4573
4769
  }
4574
4770
 
@@ -4635,7 +4831,7 @@ template <bool need_check> static __global__ void
4635
4831
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4636
4832
  #else
4637
4833
  (void) vec_dot_q5_0_q8_1_mul_mat;
4638
- bad_arch();
4834
+ NO_DEVICE_CODE;
4639
4835
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4640
4836
  }
4641
4837
 
@@ -4702,7 +4898,7 @@ mul_mat_q5_1(
4702
4898
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4703
4899
  #else
4704
4900
  (void) vec_dot_q5_1_q8_1_mul_mat;
4705
- bad_arch();
4901
+ NO_DEVICE_CODE;
4706
4902
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4707
4903
  }
4708
4904
 
@@ -4769,7 +4965,7 @@ template <bool need_check> static __global__ void
4769
4965
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4770
4966
  #else
4771
4967
  (void) vec_dot_q8_0_q8_1_mul_mat;
4772
- bad_arch();
4968
+ NO_DEVICE_CODE;
4773
4969
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4774
4970
  }
4775
4971
 
@@ -4836,7 +5032,7 @@ mul_mat_q2_K(
4836
5032
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4837
5033
  #else
4838
5034
  (void) vec_dot_q2_K_q8_1_mul_mat;
4839
- bad_arch();
5035
+ NO_DEVICE_CODE;
4840
5036
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4841
5037
  }
4842
5038
 
@@ -4905,7 +5101,7 @@ template <bool need_check> static __global__ void
4905
5101
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4906
5102
  #else
4907
5103
  (void) vec_dot_q3_K_q8_1_mul_mat;
4908
- bad_arch();
5104
+ NO_DEVICE_CODE;
4909
5105
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4910
5106
  }
4911
5107
 
@@ -4974,7 +5170,7 @@ template <bool need_check> static __global__ void
4974
5170
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4975
5171
  #else
4976
5172
  (void) vec_dot_q4_K_q8_1_mul_mat;
4977
- bad_arch();
5173
+ NO_DEVICE_CODE;
4978
5174
  #endif // __CUDA_ARCH__ >= CC_VOLTA
4979
5175
  }
4980
5176
 
@@ -5041,7 +5237,7 @@ mul_mat_q5_K(
5041
5237
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
5042
5238
  #else
5043
5239
  (void) vec_dot_q5_K_q8_1_mul_mat;
5044
- bad_arch();
5240
+ NO_DEVICE_CODE;
5045
5241
  #endif // __CUDA_ARCH__ >= CC_VOLTA
5046
5242
  }
5047
5243
 
@@ -5110,45 +5306,74 @@ template <bool need_check> static __global__ void
5110
5306
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
5111
5307
  #else
5112
5308
  (void) vec_dot_q6_K_q8_1_mul_mat;
5113
- bad_arch();
5309
+ NO_DEVICE_CODE;
5114
5310
  #endif // __CUDA_ARCH__ >= CC_VOLTA
5115
5311
  }
5116
5312
 
5117
- template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
5118
- static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
5119
- const int row = blockIdx.x*blockDim.y + threadIdx.y;
5313
+ #define MMVQ_NWARPS_NVIDIA 4
5314
+ #define MMVQ_NWARPS_AMD_RDNA2 1
5315
+ #define MMVQ_NWARPS_AMD_OLD 4
5120
5316
 
5121
- if (row >= nrows) {
5122
- return;
5123
- }
5317
+ template <int nwarps, int ncols_y_template, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
5318
+ #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
5319
+ __launch_bounds__(nwarps*WARP_SIZE, 1) // tells the compiler to use as many registers as it wants
5320
+ #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
5321
+ static __global__ void mul_mat_vec_q(
5322
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
5323
+ const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y_par, const int nrows_dst) {
5324
+
5325
+ const int ncols_y = ncols_y_template != 0 ? ncols_y_template : ncols_y_par;
5326
+
5327
+ const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
5328
+ const int row = blockIdx.x;
5124
5329
 
5125
- const int blocks_per_row = ncols / qk;
5126
- const int blocks_per_warp = vdr * WARP_SIZE / qi;
5330
+ const int blocks_per_row_x = ncols_x / qk;
5331
+ const int blocks_per_col_y = nrows_y / QK8_1;
5332
+ const int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi;
5127
5333
 
5128
5334
  // partial sum for each thread
5129
- float tmp = 0.0f;
5335
+ float tmp[ncols_y_template != 0 ? ncols_y_template : 8] = {0.0f};
5130
5336
 
5131
5337
  const block_q_t * x = (const block_q_t *) vx;
5132
5338
  const block_q8_1 * y = (const block_q8_1 *) vy;
5133
5339
 
5134
- for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
5135
- const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
5340
+ for (int i = tid / (qi/vdr); i < blocks_per_row_x; i += blocks_per_iter) {
5341
+ const int ibx = row*blocks_per_row_x + i; // x block index
5136
5342
 
5137
- const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx
5343
+ const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
5138
5344
 
5139
- const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
5345
+ const int iqs = vdr * (tid % (qi/vdr)); // x block quant index when casting the quants to int
5140
5346
 
5141
- tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
5347
+ #pragma unroll
5348
+ for (int j = 0; j < ncols_y; ++j) {
5349
+ tmp[j] += vec_dot_q_cuda(&x[ibx], &y[j*blocks_per_col_y + iby], iqs);
5350
+ }
5142
5351
  }
5143
5352
 
5144
- // sum up partial sums and write back result
5353
+ __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y_template != 0 ? ncols_y_template : 8][WARP_SIZE];
5354
+ if (threadIdx.y > 0) {
5145
5355
  #pragma unroll
5146
- for (int mask = 16; mask > 0; mask >>= 1) {
5147
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
5356
+ for (int j = 0; j < ncols_y; ++j) {
5357
+ tmp_shared[threadIdx.y-1][j][threadIdx.x] = tmp[j];
5358
+ }
5359
+ }
5360
+ __syncthreads();
5361
+ if (threadIdx.y > 0) {
5362
+ return;
5148
5363
  }
5149
5364
 
5150
- if (threadIdx.x == 0) {
5151
- dst[row] = tmp;
5365
+ // sum up partial sums and write back result
5366
+ #pragma unroll
5367
+ for (int j = 0; j < ncols_y; ++j) {
5368
+ #pragma unroll
5369
+ for (int i = 0; i < nwarps-1; ++i) {
5370
+ tmp[j] += tmp_shared[i][j][threadIdx.x];
5371
+ }
5372
+ tmp[j] = warp_reduce_sum(tmp[j]);
5373
+
5374
+ if (threadIdx.x == 0) {
5375
+ dst[j*nrows_dst + row] = tmp[j];
5376
+ }
5152
5377
  }
5153
5378
  }
5154
5379
 
@@ -5336,27 +5561,37 @@ static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
5336
5561
  *dsti = *xi;
5337
5562
  }
5338
5563
 
5564
+ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
5565
+ const half * xi = (const half *) cxi;
5566
+ float * dsti = (float *) cdsti;
5567
+
5568
+ *dsti = *xi;
5569
+ }
5570
+
5339
5571
  template <cpy_kernel_t cpy_1>
5340
5572
  static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
5341
- const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
5342
- const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
5573
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
5574
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
5575
+ const int nb12, const int nb13) {
5343
5576
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
5344
5577
 
5345
5578
  if (i >= ne) {
5346
5579
  return;
5347
5580
  }
5348
5581
 
5349
- // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
5582
+ // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
5350
5583
  // then combine those indices with the corresponding byte offsets to get the total offsets
5351
- const int i02 = i / (ne00*ne01);
5352
- const int i01 = (i - i02*ne01*ne00) / ne00;
5353
- const int i00 = i - i02*ne01*ne00 - i01*ne00;
5354
- const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
5355
-
5356
- const int i12 = i / (ne10*ne11);
5357
- const int i11 = (i - i12*ne10*ne11) / ne10;
5358
- const int i10 = i - i12*ne10*ne11 - i11*ne10;
5359
- const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
5584
+ const int i03 = i/(ne00 * ne01 * ne02);
5585
+ const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
5586
+ const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
5587
+ const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
5588
+ const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
5589
+
5590
+ const int i13 = i/(ne10 * ne11 * ne12);
5591
+ const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
5592
+ const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
5593
+ const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
5594
+ const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
5360
5595
 
5361
5596
  cpy_1(cx + x_offset, cdst + dst_offset);
5362
5597
  }
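The copy kernels now handle 4-dimensional tensors: the flattened element index i is decomposed into (i03, i02, i01, i00) by successive division and remainder against the element counts, and the per-dimension byte strides (nb00..nb03, nb10..nb13) turn those indices into source and destination byte offsets. A worked decomposition for a hypothetical shape ne00 = 4, ne01 = 3, ne02 = 2:

    // i = 29:
    //   i03 = 29 / (4*3*2)               = 1
    //   i02 = (29 - 1*24) / (4*3)        = 0
    //   i01 = (29 - 1*24 - 0*12) / 4     = 1
    //   i00 =  29 - 1*24 - 0*12 - 1*4    = 1
    //   x_offset = 1*nb00 + 1*nb01 + 0*nb02 + 1*nb03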
@@ -5450,23 +5685,26 @@ static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
5450
5685
 
5451
5686
  template <cpy_kernel_t cpy_blck, int qk>
5452
5687
  static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
5453
- const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
5454
- const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
5688
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
5689
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
5690
+ const int nb12, const int nb13) {
5455
5691
  const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
5456
5692
 
5457
5693
  if (i >= ne) {
5458
5694
  return;
5459
5695
  }
5460
5696
 
5461
- const int i02 = i / (ne00*ne01);
5462
- const int i01 = (i - i02*ne01*ne00) / ne00;
5463
- const int i00 = (i - i02*ne01*ne00 - i01*ne00);
5464
- const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
5697
+ const int i03 = i/(ne00 * ne01 * ne02);
5698
+ const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
5699
+ const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
5700
+ const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
5701
+ const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
5465
5702
 
5466
- const int i12 = i / (ne10*ne11);
5467
- const int i11 = (i - i12*ne10*ne11) / ne10;
5468
- const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
5469
- const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
5703
+ const int i13 = i/(ne10 * ne11 * ne12);
5704
+ const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
5705
+ const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
5706
+ const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
5707
+ const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
5470
5708
 
5471
5709
  cpy_blck(cx + x_offset, cdst + dst_offset);
5472
5710
  }
@@ -5635,7 +5873,7 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
5635
5873
  }
5636
5874
 
5637
5875
  static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
5638
- const int row = blockIdx.y;
5876
+ const int row = blockIdx.x;
5639
5877
  const int col = threadIdx.x;
5640
5878
 
5641
5879
  float sum = 0.0f;
@@ -5833,7 +6071,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
5833
6071
  }
5834
6072
  #else
5835
6073
  (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
5836
- bad_arch();
6074
+ NO_DEVICE_CODE;
5837
6075
  #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
5838
6076
  }
5839
6077
 
@@ -5957,9 +6195,10 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
5957
6195
  dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
5958
6196
  }
5959
6197
 
5960
- static __global__ void im2col_f32_f16(
5961
- const float * x, half * dst,
5962
- int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW,
6198
+ template <typename T>
6199
+ static __global__ void im2col_kernel(
6200
+ const float * x, T * dst, int batch_offset,
6201
+ int offset_delta, int IC, int IW, int IH, int OH, int OW, int KW, int KH, int pelements, int CHW,
5963
6202
  int s0, int s1, int p0, int p1, int d0, int d1) {
5964
6203
  const int i = threadIdx.x + blockIdx.x * blockDim.x;
5965
6204
  if (i >= pelements) {
@@ -5972,21 +6211,73 @@ static __global__ void im2col_f32_f16(
5972
6211
  const int ky = (i - kd) / OW;
5973
6212
  const int ix = i % OW;
5974
6213
 
6214
+ const int oh = blockIdx.y;
6215
+ const int batch = blockIdx.z / IC;
6216
+ const int ic = blockIdx.z % IC;
6217
+
5975
6218
  const int64_t iiw = ix * s0 + kx * d0 - p0;
5976
- const int64_t iih = blockIdx.y * s1 + ky * d1 - p1;
6219
+ const int64_t iih = oh * s1 + ky * d1 - p1;
5977
6220
 
5978
6221
  const int64_t offset_dst =
5979
- (blockIdx.y * OW + ix) * CHW +
5980
- (blockIdx.z * (KW * KH) + ky * KW + kx);
6222
+ ((batch * OH + oh) * OW + ix) * CHW +
6223
+ (ic * (KW * KH) + ky * KW + kx);
5981
6224
 
5982
6225
  if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
5983
- dst[offset_dst] = __float2half(0.0f);
6226
+ dst[offset_dst] = 0.0f;
5984
6227
  } else {
5985
- const int64_t offset_src = blockIdx.z * offset_delta;
5986
- dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
6228
+ const int64_t offset_src = ic * offset_delta + batch * batch_offset;
6229
+ dst[offset_dst] = x[offset_src + iih * IW + iiw];
5987
6230
  }
5988
6231
  }
5989
6232
 
6233
+ template <typename Ti, typename To>
6234
+ static __global__ void pool2d_nchw_kernel(
6235
+ const int ih, const int iw, const int oh, const int ow,
6236
+ const int kh, const int kw, const int sh, const int sw,
6237
+ const int ph, const int pw, const int parallel_elements,
6238
+ const Ti* src, To* dst, const enum ggml_op_pool op) {
6239
+ int idx = threadIdx.x + blockIdx.x * blockDim.x;
6240
+ if (idx >= parallel_elements) {
6241
+ return;
6242
+ }
6243
+
6244
+ const int I_HW = ih * iw;
6245
+ const int O_HW = oh * ow;
6246
+ const int nc = idx / O_HW;
6247
+ const int cur_oh = idx % O_HW / ow;
6248
+ const int cur_ow = idx % O_HW % ow;
6249
+ const Ti* i_ptr = src + nc * I_HW;
6250
+ To* o_ptr = dst + nc * O_HW;
6251
+ const int start_h = cur_oh * sh - ph;
6252
+ const int bh = max(0, start_h);
6253
+ const int eh = min(ih, start_h + kh);
6254
+ const int start_w = cur_ow * sw - pw;
6255
+ const int bw = max(0, start_w);
6256
+ const int ew = min(iw, start_w + kw);
6257
+ const To scale = 1. / (kh * kw);
6258
+ To res = 0;
6259
+
6260
+ switch (op) {
6261
+ case GGML_OP_POOL_AVG: res = 0; break;
6262
+ case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
6263
+ }
6264
+
6265
+ for (int i = bh; i < eh; i += 1) {
6266
+ for (int j = bw; j < ew; j += 1) {
6267
+ #if __CUDA_ARCH__ >= 350
6268
+ Ti cur = __ldg(i_ptr + i * iw + j);
6269
+ #else
6270
+ Ti cur = i_ptr[i * iw + j];
6271
+ #endif
6272
+ switch (op) {
6273
+ case GGML_OP_POOL_AVG: res += cur * scale; break;
6274
+ case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
6275
+ }
6276
+ }
6277
+ }
6278
+ o_ptr[cur_oh * ow + cur_ow] = res;
6279
+ }
6280
+
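pool2d_nchw_kernel assigns one thread per output element of an NCHW tensor: idx / (oh*ow) selects the (batch, channel) plane and the remainder gives the output row and column; the pooling window is clamped to the input bounds and then either max-reduced or averaged with a fixed scale of 1/(kh*kw), i.e. the divisor stays the full kernel area even when the window is clipped at a border. A small worked example for average pooling:

    // kh = kw = 2, window fully inside the input, covered values {1, 2, 3, 4}:
    //   scale = 1.0 / (2*2) = 0.25
    //   res   = (1 + 2 + 3 + 4) * 0.25 = 2.5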
5990
6281
  template<int qk, int qr, dequantize_kernel_t dq>
5991
6282
  static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5992
6283
  const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
@@ -6200,6 +6491,16 @@ static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
6200
6491
  relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
6201
6492
  }
6202
6493
 
6494
+ static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
6495
+ const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
6496
+ hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
6497
+ }
6498
+
6499
+ static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
6500
+ const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
6501
+ hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
6502
+ }
6503
+
6203
6504
  static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
6204
6505
  const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
6205
6506
  leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
@@ -6360,6 +6661,12 @@ static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k,
6360
6661
  dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
6361
6662
  }
6362
6663
 
6664
+ template<typename dst_t>
6665
+ static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
6666
+ const int nb = k / QK_K;
6667
+ dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
6668
+ }
6669
+
6363
6670
  template <typename src_t, typename dst_t>
6364
6671
  static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
6365
6672
  const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
@@ -6397,6 +6704,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
6397
6704
  return dequantize_row_iq2_xxs_cuda;
6398
6705
  case GGML_TYPE_IQ2_XS:
6399
6706
  return dequantize_row_iq2_xs_cuda;
6707
+ case GGML_TYPE_IQ3_XXS:
6708
+ return dequantize_row_iq3_xxs_cuda;
6400
6709
  case GGML_TYPE_F32:
6401
6710
  return convert_unary_cuda<float>;
6402
6711
  default:
@@ -6430,6 +6739,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
6430
6739
  return dequantize_row_iq2_xxs_cuda;
6431
6740
  case GGML_TYPE_IQ2_XS:
6432
6741
  return dequantize_row_iq2_xs_cuda;
6742
+ case GGML_TYPE_IQ3_XXS:
6743
+ return dequantize_row_iq3_xxs_cuda;
6433
6744
  case GGML_TYPE_F16:
6434
6745
  return convert_unary_cuda<half>;
6435
6746
  default:
@@ -6534,112 +6845,75 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
6534
6845
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
6535
6846
  }
6536
6847
 
6537
- static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
6538
- GGML_ASSERT(ncols % QK4_0 == 0);
6539
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
6540
- const dim3 block_nums(block_num_y, 1, 1);
6541
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
6542
- mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
6543
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
6544
- }
6848
+ template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot>
6849
+ static void mul_mat_vec_q_cuda(
6850
+ const void * vx, const void * vy, float * dst,
6851
+ const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
6545
6852
 
6546
- static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
6547
- GGML_ASSERT(ncols % QK4_1 == 0);
6548
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
6549
- const dim3 block_nums(block_num_y, 1, 1);
6550
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
6551
- mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
6552
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
6553
- }
6853
+ GGML_ASSERT(ncols_x % qk == 0);
6854
+ GGML_ASSERT(ncols_y <= 4);
6554
6855
 
6555
- static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
6556
- GGML_ASSERT(ncols % QK5_0 == 0);
6557
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
6558
- const dim3 block_nums(block_num_y, 1, 1);
6559
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
6560
- mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
6561
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
6562
- }
6563
-
6564
- static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
6565
- GGML_ASSERT(ncols % QK5_1 == 0);
6566
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
6567
- const dim3 block_nums(block_num_y, 1, 1);
6568
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
6569
- mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
6570
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
6571
- }
6572
-
6573
- static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
6574
- GGML_ASSERT(ncols % QK8_0 == 0);
6575
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
6576
- const dim3 block_nums(block_num_y, 1, 1);
6577
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
6578
- mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
6579
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
6580
- }
6581
-
6582
- static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
6583
- GGML_ASSERT(ncols % QK_K == 0);
6584
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
6585
- const dim3 block_nums(block_num_y, 1, 1);
6586
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
6587
- mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
6588
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
6589
- }
6590
-
6591
- static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
6592
- GGML_ASSERT(ncols % QK_K == 0);
6593
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
6594
- const dim3 block_nums(block_num_y, 1, 1);
6595
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
6596
- mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
6597
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
6598
- }
6599
-
6600
- static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
6601
- GGML_ASSERT(ncols % QK_K == 0);
6602
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
6603
- const dim3 block_nums(block_num_y, 1, 1);
6604
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
6605
- mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
6606
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
6607
- }
6856
+ int id;
6857
+ CUDA_CHECK(cudaGetDevice(&id));
6608
6858
 
6609
- static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
6610
- GGML_ASSERT(ncols % QK_K == 0);
6611
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
6612
- const dim3 block_nums(block_num_y, 1, 1);
6613
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
6614
- mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
6615
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
6616
- }
6859
+ int nwarps;
6860
+ if (g_device_caps[id].cc >= CC_OFFSET_AMD) {
6861
+ nwarps = g_device_caps[id].cc >= CC_RDNA2 ? MMVQ_NWARPS_AMD_RDNA2 : MMVQ_NWARPS_AMD_OLD;
6862
+ } else {
6863
+ nwarps = MMVQ_NWARPS_NVIDIA;
6864
+ }
6617
6865
 
6618
- static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
6619
- GGML_ASSERT(ncols % QK_K == 0);
6620
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
6621
- const dim3 block_nums(block_num_y, 1, 1);
6622
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
6623
- mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
6624
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
6625
- }
6866
+ const dim3 block_nums(nrows_x, 1, 1);
6867
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
6626
6868
 
6627
- static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
6628
- GGML_ASSERT(ncols % QK_K == 0);
6629
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
6630
- const dim3 block_nums(block_num_y, 1, 1);
6631
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
6632
- mul_mat_vec_q<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
6633
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
6634
- }
6869
+ switch (nwarps) {
6870
+ case 1: switch(ncols_y) {
6871
+ case 1:
6872
+ mul_mat_vec_q<1, 1, qk, qi, block_q_t, vdr, vec_dot>
6873
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
6874
+ break;
6875
+ case 2:
6876
+ mul_mat_vec_q<1, 2, qk, qi, block_q_t, vdr, vec_dot>
6877
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
6878
+ break;
6879
+ case 3:
6880
+ mul_mat_vec_q<1, 3, qk, qi, block_q_t, vdr, vec_dot>
6881
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
6882
+ break;
6883
+ case 4:
6884
+ mul_mat_vec_q<1, 4, qk, qi, block_q_t, vdr, vec_dot>
6885
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
6886
+ break;
6887
+ default:
6888
+ GGML_ASSERT(false);
6889
+ break;
6890
+ } break;
6891
+ case 4: switch(ncols_y) {
6892
+ case 1:
6893
+ mul_mat_vec_q<4, 1, qk, qi, block_q_t, vdr, vec_dot>
6894
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
6895
+ break;
6896
+ case 2:
6897
+ mul_mat_vec_q<4, 2, qk, qi, block_q_t, vdr, vec_dot>
6898
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
6899
+ break;
6900
+ case 3:
6901
+ mul_mat_vec_q<4, 3, qk, qi, block_q_t, vdr, vec_dot>
6902
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
6903
+ break;
6904
+ case 4:
6905
+ mul_mat_vec_q<4, 4, qk, qi, block_q_t, vdr, vec_dot>
6906
+ <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst);
6907
+ break;
6908
+ default:
6909
+ GGML_ASSERT(false);
6910
+ break;
6911
+ } break;
6635
6912
 
6636
- static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
6637
- GGML_ASSERT(ncols % QK_K == 0);
6638
- const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
6639
- const dim3 block_nums(block_num_y, 1, 1);
6640
- const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
6641
- mul_mat_vec_q<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
6642
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
6913
+ default:
6914
+ GGML_ASSERT(false);
6915
+ break;
6916
+ }
6643
6917
  }
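The per-type mul_mat_vec_*_q8_1_cuda launchers are folded into this single templated helper: it reads the device's compute capability to pick the warp count (MMVQ_NWARPS_NVIDIA on NVIDIA, fewer warps on RDNA2), launches one block per x row, and dispatches on ncols_y so each 1-4 column batch size gets its own template instantiation with a compile-time column count. Under that refactor, a per-type wrapper presumably reduces to a one-line forwarding call; a sketch for the q4_0 case, mirroring the template arguments of the removed launcher above (assumed shape, not shown in this hunk):

    static void mul_mat_vec_q4_0_q8_1_cuda(
        const void * vx, const void * vy, float * dst,
        const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) {
        mul_mat_vec_q_cuda<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
            (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream);
    }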
6644
6918
 
6645
6919
  static void ggml_mul_mat_q4_0_q8_1_cuda(
@@ -7114,69 +7388,82 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
7114
7388
  (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
7115
7389
  }
7116
7390
 
7391
+
7392
+ static void ggml_cpy_f16_f32_cuda(
7393
+ const char * cx, char * cdst, const int ne,
7394
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
7395
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
7396
+
7397
+ const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
7398
+ cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
7399
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
7400
+ }
7401
+
7117
7402
  static void ggml_cpy_f32_f32_cuda(
7118
7403
  const char * cx, char * cdst, const int ne,
7119
- const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
7120
- const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
7404
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
7405
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
7121
7406
 
7122
7407
  const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
7123
7408
  cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
7124
- (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
7409
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
7125
7410
  }
7126
7411
 
7127
7412
  static void ggml_cpy_f32_f16_cuda(
7128
7413
  const char * cx, char * cdst, const int ne,
7129
- const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
7130
- const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
7414
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
7415
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
7131
7416
 
7132
7417
  const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
7133
7418
  cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
7134
- (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
7419
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
7135
7420
  }
7136
7421
 
7137
7422
  static void ggml_cpy_f32_q8_0_cuda(
7138
7423
  const char * cx, char * cdst, const int ne,
7139
- const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
7140
- const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
7424
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
7425
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
7141
7426
 
7142
7427
  GGML_ASSERT(ne % QK8_0 == 0);
7143
7428
  const int num_blocks = ne / QK8_0;
7144
7429
  cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
7145
- (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
7430
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
7146
7431
  }
7147
7432
 
7148
7433
  static void ggml_cpy_f32_q4_0_cuda(
7149
7434
  const char * cx, char * cdst, const int ne,
7150
- const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
7151
- const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
7435
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
7436
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
7152
7437
 
7153
7438
  GGML_ASSERT(ne % QK4_0 == 0);
7154
7439
  const int num_blocks = ne / QK4_0;
7155
7440
  cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
7156
- (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
7441
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
7157
7442
  }
7158
7443
 
7159
7444
  static void ggml_cpy_f32_q4_1_cuda(
7160
7445
  const char * cx, char * cdst, const int ne,
7161
- const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
7162
- const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
7446
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
7447
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
7163
7448
 
7164
7449
  GGML_ASSERT(ne % QK4_1 == 0);
7165
7450
  const int num_blocks = ne / QK4_1;
7166
7451
  cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
7167
- (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
7452
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
7168
7453
  }
7169
7454
 
7170
7455
  static void ggml_cpy_f16_f16_cuda(
7171
7456
  const char * cx, char * cdst, const int ne,
7172
- const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
7173
- const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
7457
+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
7458
+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
7174
7459
 
7175
7460
  const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
7176
7461
  cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
7177
- (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
7462
+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
7178
7463
  }
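Note: all of the ggml_cpy_*_cuda wrappers in this hunk gain a third extent (ne02/ne12) and the outermost byte strides (nb03/nb13), so a copy can address full 4-D tensors instead of a single 3-D slice; the matching kernels decompose a flat element index into four coordinates and turn them into byte offsets. A hedged sketch of that index arithmetic, with a made-up helper name:

    #include <cstdint>

    // Sketch, not the library's kernel: flat element index -> byte offset,
    // given per-dimension extents (ne*) and byte strides (nb*).
    static inline int64_t flat_to_byte_offset_sketch(
            int64_t i, int64_t ne0, int64_t ne1, int64_t ne2,
            int64_t nb0, int64_t nb1, int64_t nb2, int64_t nb3) {
        const int64_t i3 =  i / (ne2*ne1*ne0);
        const int64_t i2 = (i - i3*ne2*ne1*ne0) / (ne1*ne0);
        const int64_t i1 = (i - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
        const int64_t i0 =  i - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0;
        return i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3;
    }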
7179
7464
 
7465
+
7466
+
7180
7467
  static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
7181
7468
  const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
7182
7469
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
@@ -7255,7 +7542,7 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
7255
7542
 
7256
7543
  static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
7257
7544
  const dim3 block_dims(WARP_SIZE, 1, 1);
7258
- const dim3 block_nums(1, nrows, 1);
7545
+ const dim3 block_nums(nrows, 1, 1);
7259
7546
  k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
7260
7547
  }
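Note: sum_rows_f32_cuda now puts one block per row on grid.x instead of grid.y; grid.y is limited to 65535 blocks while grid.x allows on the order of 2^31, so tensors with very many rows no longer risk an invalid launch configuration. A standalone sketch of a one-warp-per-row reduction launched this way, assuming a 32-thread block as in the block_dims above (illustrative, not the library's k_sum_rows_f32):

    __global__ void sum_rows_sketch(const float * x, float * dst, const int ncols) {
        const int row = blockIdx.x;            // one block per row, rows on grid.x
        float sum = 0.0f;
        for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
            sum += x[row*ncols + col];
        }
        for (int offset = 16; offset > 0; offset >>= 1) {
            sum += __shfl_xor_sync(0xffffffff, sum, offset);  // warp-wide reduction
        }
        if (threadIdx.x == 0) {
            dst[row] = sum;
        }
    }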
7261
7548
 
@@ -7367,14 +7654,15 @@ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, con
7367
7654
  }
7368
7655
  }
7369
7656
 
7370
- static void im2col_f32_f16_cuda(const float* x, half* dst,
7657
+ template <typename T>
7658
+ static void im2col_cuda(const float* x, T* dst,
7371
7659
  int IW, int IH, int OW, int OH, int KW, int KH, int IC,
7372
- int offset_delta,
7660
+ int batch, int batch_offset, int offset_delta,
7373
7661
  int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
7374
7662
  const int parallel_elements = OW * KW * KH;
7375
7663
  const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
7376
- dim3 block_nums(num_blocks, OH, IC);
7377
- im2col_f32_f16<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
7664
+ dim3 block_nums(num_blocks, OH, batch * IC);
7665
+ im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
7378
7666
  }
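Note: im2col_cuda is now templated on the destination type and takes the batch count plus a per-image offset; the batch is folded into grid.z together with the channel count (block_nums = dim3(num_blocks, OH, batch * IC)), so a kernel launched this way has to split blockIdx.z back into an image index and a channel index. A minimal sketch of that decomposition (illustrative name, same grid layout assumed):

    // Sketch: recover (batch, channel) from a fused grid.z index of size batch*IC.
    static inline void decode_grid_z_sketch(int z, int IC, int & batch, int & channel) {
        channel = z % IC;   // channel within the image
        batch   = z / IC;   // image within the batch
    }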
7379
7667
 
7380
7668
  // buffer pool for cuda
@@ -7959,6 +8247,34 @@ static void ggml_cuda_op_relu(
7959
8247
  (void) src1_dd;
7960
8248
  }
7961
8249
 
8250
+ static void ggml_cuda_op_hardsigmoid(
8251
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
8252
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
8253
+
8254
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
8255
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
8256
+
8257
+ hardsigmoid_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
8258
+
8259
+ (void) src1;
8260
+ (void) dst;
8261
+ (void) src1_dd;
8262
+ }
8263
+
8264
+ static void ggml_cuda_op_hardswish(
8265
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
8266
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
8267
+
8268
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
8269
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
8270
+
8271
+ hardswish_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
8272
+
8273
+ (void) src1;
8274
+ (void) dst;
8275
+ (void) src1_dd;
8276
+ }
8277
+
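Note: ggml_cuda_op_hardsigmoid and ggml_cuda_op_hardswish are thin F32-only wrappers: they assert the types and launch an elementwise kernel over ggml_nelements(src0). The kernels themselves are defined elsewhere in the file; as a hedged reference, minimal standalone versions using the conventional definitions hardsigmoid(x) = clamp((x + 3) / 6, 0, 1) and hardswish(x) = x * hardsigmoid(x) would look like this (one thread per element, bounds-checked):

    __global__ void hardsigmoid_sketch(const float * x, float * dst, const int k) {
        const int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i >= k) {
            return;
        }
        dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
    }

    __global__ void hardswish_sketch(const float * x, float * dst, const int k) {
        const int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i >= k) {
            return;
        }
        dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
    }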
7962
8278
  static void ggml_cuda_op_leaky_relu(
7963
8279
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7964
8280
  const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
@@ -8114,7 +8430,7 @@ static void ggml_cuda_op_mul_mat_q(
8114
8430
  CUDA_CHECK(cudaGetDevice(&id));
8115
8431
 
8116
8432
  // the main device has a larger memory buffer to hold the results from all GPUs
8117
- // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
8433
+ // nrows_dst == nrows of the matrix that the kernel writes into
8118
8434
  const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
8119
8435
 
8120
8436
  switch (src0->type) {
@@ -8192,6 +8508,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
8192
8508
  case GGML_TYPE_Q6_K:
8193
8509
  case GGML_TYPE_IQ2_XXS:
8194
8510
  case GGML_TYPE_IQ2_XS:
8511
+ case GGML_TYPE_IQ3_XXS:
8195
8512
  return max_compute_capability >= CC_RDNA2 ? 128 : 64;
8196
8513
  default:
8197
8514
  GGML_ASSERT(false);
@@ -8214,6 +8531,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
8214
8531
  case GGML_TYPE_Q5_K:
8215
8532
  case GGML_TYPE_IQ2_XXS:
8216
8533
  case GGML_TYPE_IQ2_XS:
8534
+ case GGML_TYPE_IQ3_XXS:
8217
8535
  return max_compute_capability >= CC_VOLTA ? 128 : 64;
8218
8536
  case GGML_TYPE_Q6_K:
8219
8537
  return 64;
@@ -8243,47 +8561,73 @@ static void ggml_cuda_op_mul_mat_vec_q(
8243
8561
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
8244
8562
  const int64_t src1_padded_row_size, cudaStream_t stream) {
8245
8563
 
8246
- GGML_ASSERT(ggml_nrows(src1) == 1);
8247
-
8248
8564
  const int64_t ne00 = src0->ne[0];
8249
8565
  const int64_t row_diff = row_high - row_low;
8250
8566
 
8567
+ const int64_t ne10 = src1->ne[0];
8568
+ GGML_ASSERT(ne10 % QK8_1 == 0);
8569
+
8570
+ const int64_t ne0 = dst->ne[0];
8571
+
8572
+ int id;
8573
+ CUDA_CHECK(cudaGetDevice(&id));
8574
+
8575
+ // the main device has a larger memory buffer to hold the results from all GPUs
8576
+ // nrows_dst == nrows of the matrix that the kernel writes into
8577
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
8578
+
8251
8579
  switch (src0->type) {
8252
8580
  case GGML_TYPE_Q4_0:
8253
- mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
8581
+ mul_mat_vec_q_cuda<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
8582
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8254
8583
  break;
8255
8584
  case GGML_TYPE_Q4_1:
8256
- mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
8585
+ mul_mat_vec_q_cuda<QK4_1, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
8586
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8257
8587
  break;
8258
8588
  case GGML_TYPE_Q5_0:
8259
- mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
8589
+ mul_mat_vec_q_cuda<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
8590
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8260
8591
  break;
8261
8592
  case GGML_TYPE_Q5_1:
8262
- mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
8593
+ mul_mat_vec_q_cuda<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
8594
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8263
8595
  break;
8264
8596
  case GGML_TYPE_Q8_0:
8265
- mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
8597
+ mul_mat_vec_q_cuda<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
8598
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8266
8599
  break;
8267
8600
  case GGML_TYPE_Q2_K:
8268
- mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
8601
+ mul_mat_vec_q_cuda<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
8602
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8269
8603
  break;
8270
8604
  case GGML_TYPE_Q3_K:
8271
- mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
8605
+ mul_mat_vec_q_cuda<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
8606
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8272
8607
  break;
8273
8608
  case GGML_TYPE_Q4_K:
8274
- mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
8609
+ mul_mat_vec_q_cuda<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
8610
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8275
8611
  break;
8276
8612
  case GGML_TYPE_Q5_K:
8277
- mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
8613
+ mul_mat_vec_q_cuda<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
8614
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8278
8615
  break;
8279
8616
  case GGML_TYPE_Q6_K:
8280
- mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
8617
+ mul_mat_vec_q_cuda<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
8618
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8281
8619
  break;
8282
8620
  case GGML_TYPE_IQ2_XXS:
8283
- mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
8621
+ mul_mat_vec_q_cuda<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
8622
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8284
8623
  break;
8285
8624
  case GGML_TYPE_IQ2_XS:
8286
- mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
8625
+ mul_mat_vec_q_cuda<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
8626
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8627
+ break;
8628
+ case GGML_TYPE_IQ3_XXS:
8629
+ mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
8630
+ (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
8287
8631
  break;
8288
8632
  default:
8289
8633
  GGML_ASSERT(false);
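Note: ggml_cuda_op_mul_mat_vec_q no longer requires src1 to be a single row; it checks that ne10 is a multiple of QK8_1, picks the number of destination rows depending on whether this is the main device, and then funnels every quantization type through the one templated launcher instead of a per-type wrapper. The row-count choice mirrors the comment above; a sketch with an illustrative helper name:

    // Sketch of the nrows_dst selection above: the main device holds the full dst
    // buffer (ne0 rows), other devices only their local row slice.
    static inline int64_t pick_nrows_dst_sketch(bool dst_on_gpu, bool is_main_device,
                                                int64_t ne0, int64_t row_diff) {
        return (dst_on_gpu && is_main_device) ? ne0 : row_diff;
    }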
@@ -8319,9 +8663,9 @@ static void ggml_cuda_op_dequantize_mul_mat_vec(
8319
8663
 
8320
8664
  if (src1_convert_f16) {
8321
8665
  src1_dfloat = src1_dfloat_a.alloc(ne00);
8322
- ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
8323
- ne00, 1, sizeof(float), 0, 0,
8324
- ne00, 1, sizeof(half), 0, 0, stream);
8666
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
8667
+ GGML_ASSERT(to_fp16_cuda != nullptr);
8668
+ to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
8325
8669
  }
8326
8670
  #else
8327
8671
  const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
@@ -8585,13 +8929,46 @@ static void ggml_cuda_op_alibi(
8585
8929
  (void) src1_dd;
8586
8930
  }
8587
8931
 
8932
+ static void ggml_cuda_op_pool2d(
8933
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
8934
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
8935
+
8936
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
8937
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
8938
+
8939
+ const int32_t * opts = (const int32_t *)dst->op_params;
8940
+ enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
8941
+ const int k0 = opts[1];
8942
+ const int k1 = opts[2];
8943
+ const int s0 = opts[3];
8944
+ const int s1 = opts[4];
8945
+ const int p0 = opts[5];
8946
+ const int p1 = opts[6];
8947
+
8948
+ const int64_t IH = src0->ne[1];
8949
+ const int64_t IW = src0->ne[0];
8950
+
8951
+ const int64_t N = dst->ne[3];
8952
+ const int64_t OC = dst->ne[2];
8953
+ const int64_t OH = dst->ne[1];
8954
+ const int64_t OW = dst->ne[0];
8955
+
8956
+ const int parallel_elements = N * OC * OH * OW;
8957
+ const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
8958
+ dim3 block_nums(num_blocks);
8959
+ pool2d_nchw_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, main_stream>>>(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_dd, dst_dd, op);
8960
+
8961
+ (void) src1;
8962
+ (void) src1_dd;
8963
+ }
8964
+
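Note: ggml_cuda_op_pool2d unpacks the pooling operator, kernel size, stride and padding from dst->op_params, reads the spatial extents from the tensor shapes, and launches one thread per output element. The block count is derived from CUDA_POOL2D_BLOCK_SIZE while the launch itself uses CUDA_IM2COL_BLOCK_SIZE threads per block; that is only consistent as long as the two macros expand to the same value. A sketch of the launch-size arithmetic with an assumed helper name:

    // Sketch: one thread per output element, ceil-divided into fixed-size blocks.
    static inline int pool2d_num_blocks_sketch(int64_t N, int64_t OC, int64_t OH, int64_t OW,
                                               int block_size) {
        const int parallel_elements = (int) (N * OC * OH * OW);
        return (parallel_elements + block_size - 1) / block_size;
    }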
8588
8965
  static void ggml_cuda_op_im2col(
8589
8966
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
8590
8967
  const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
8591
8968
 
8592
8969
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
8593
8970
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
8594
- GGML_ASSERT( dst->type == GGML_TYPE_F16);
8971
+ GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
8595
8972
 
8596
8973
  const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
8597
8974
  const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
@@ -8613,8 +8990,14 @@ static void ggml_cuda_op_im2col(
8613
8990
  const int64_t OW = dst->ne[1];
8614
8991
 
8615
8992
  const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
8993
+ const int64_t batch = src1->ne[3];
8994
+ const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
8616
8995
 
8617
- im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
8996
+ if(dst->type == GGML_TYPE_F16) {
8997
+ im2col_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
8998
+ } else {
8999
+ im2col_cuda(src1_dd, (float*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
9000
+ }
8618
9001
 
8619
9002
  (void) src0;
8620
9003
  (void) src0_dd;
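Note: ggml_cuda_op_im2col now accepts either an F16 or an F32 destination (hence the relaxed assert above) and simply instantiates the templated im2col_cuda for the matching pointer type. A hedged sketch of that split, with illustrative names; the real launcher also forwards the full geometry:

    #include <cuda_fp16.h>

    template <typename T>
    static void im2col_launch_sketch(const float * src, T * dst) {
        (void) src; (void) dst; // the real version launches im2col_kernel<T> with the geometry
    }

    static void im2col_dispatch_sketch(const float * src, void * dst, bool dst_is_f16) {
        if (dst_is_f16) {
            im2col_launch_sketch(src, (half *) dst);
        } else {
            im2col_launch_sketch(src, (float *) dst);
        }
    }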
@@ -9210,6 +9593,13 @@ static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, g
9210
9593
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
9211
9594
  }
9212
9595
 
9596
+ static void ggml_cuda_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
9597
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardsigmoid);
9598
+ }
9599
+
9600
+ static void ggml_cuda_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
9601
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardswish);
9602
+ }
9213
9603
  static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
9214
9604
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
9215
9605
  }
@@ -9561,17 +9951,18 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
9561
9951
  #ifdef GGML_CUDA_FORCE_DMMV
9562
9952
  const bool use_mul_mat_vec_q = false;
9563
9953
  #else
9564
- const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
9954
+ const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
9565
9955
  #endif // GGML_CUDA_FORCE_DMMV
9566
9956
 
9567
9957
  if (use_mul_mat_vec_q) {
9568
- // NOTE: this kernel does not support ggml_nrows(src1) > 1
9569
9958
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
9570
9959
  } else {
9571
9960
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
9572
9961
  }
9573
9962
  } else {
9574
- if (use_mul_mat_q) {
9963
+ if (src1->ne[1] <= 4 && min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32) {
9964
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
9965
+ } else if (use_mul_mat_q) {
9575
9966
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
9576
9967
  } else {
9577
9968
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
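Note: the kernel-selection logic changes in two ways: the MMVQ path drops its single-row requirement on src1, and for small batches (src1->ne[1] <= 4) of F32 activations against quantized weights on DP4A-capable devices the vector kernel is now chosen ahead of MMQ/cuBLAS. Restated as a standalone predicate (names illustrative; the constant below stands in for the file's MIN_CC_DP4A and its value is an assumption):

    static const int MIN_CC_DP4A_SKETCH = 610; // assumed value of MIN_CC_DP4A

    static bool prefer_mmvq_small_batch_sketch(int64_t src1_ne1, int min_compute_capability,
                                               bool src0_is_quantized, bool src1_is_f32) {
        return src1_ne1 <= 4
            && min_compute_capability >= MIN_CC_DP4A_SKETCH
            && src0_is_quantized
            && src1_is_f32;
    }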
@@ -9769,8 +10160,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
9769
10160
  // TODO: mmq/mmv support
9770
10161
  #endif
9771
10162
 
9772
- const int64_t nb11 = src1->nb[1];
9773
- const int64_t nb1 = dst->nb[1];
10163
+ const size_t nb11 = src1->nb[1];
10164
+ const size_t nb1 = dst->nb[1];
9774
10165
 
9775
10166
  const struct ggml_tensor * ids = src0;
9776
10167
  const int32_t id = ((int32_t *) dst->op_params)[0];
@@ -9920,19 +10311,25 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
9920
10311
 
9921
10312
  const int64_t ne00 = src0->ne[0];
9922
10313
  const int64_t ne01 = src0->ne[1];
9923
- GGML_ASSERT(src0->ne[3] == 1);
10314
+ const int64_t ne02 = src0->ne[2];
10315
+
10316
+ //GGML_ASSERT(src0->ne[3] == 1);
9924
10317
 
9925
10318
  const int64_t nb00 = src0->nb[0];
9926
10319
  const int64_t nb01 = src0->nb[1];
9927
10320
  const int64_t nb02 = src0->nb[2];
10321
+ const int64_t nb03 = src0->nb[3];
9928
10322
 
9929
10323
  const int64_t ne10 = src1->ne[0];
9930
10324
  const int64_t ne11 = src1->ne[1];
9931
- GGML_ASSERT(src1->ne[3] == 1);
10325
+ const int64_t ne12 = src1->ne[2];
10326
+
10327
+ //GGML_ASSERT(src1->ne[3] == 1);
9932
10328
 
9933
10329
  const int64_t nb10 = src1->nb[0];
9934
10330
  const int64_t nb11 = src1->nb[1];
9935
10331
  const int64_t nb12 = src1->nb[2];
10332
+ const int64_t nb13 = src1->nb[3];
9936
10333
 
9937
10334
  ggml_cuda_set_device(g_main_device);
9938
10335
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
@@ -9944,17 +10341,19 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
9944
10341
  char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
9945
10342
 
9946
10343
  if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
9947
- ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
10344
+ ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
9948
10345
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
9949
- ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
10346
+ ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
9950
10347
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
9951
- ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
10348
+ ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
9952
10349
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
9953
- ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
10350
+ ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
9954
10351
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
9955
- ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
10352
+ ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
9956
10353
  } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
9957
- ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
10354
+ ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
10355
+ } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
10356
+ ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
9958
10357
  } else {
9959
10358
  fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
9960
10359
  ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -9987,6 +10386,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
9987
10386
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
9988
10387
  }
9989
10388
 
10389
+ static void ggml_cuda_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10390
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pool2d);
10391
+ }
10392
+
9990
10393
  static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
9991
10394
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
9992
10395
  }
@@ -10088,6 +10491,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
10088
10491
  case GGML_UNARY_OP_RELU:
10089
10492
  func = ggml_cuda_relu;
10090
10493
  break;
10494
+ case GGML_UNARY_OP_HARDSIGMOID:
10495
+ func = ggml_cuda_hardsigmoid;
10496
+ break;
10497
+ case GGML_UNARY_OP_HARDSWISH:
10498
+ func = ggml_cuda_hardswish;
10499
+ break;
10091
10500
  default:
10092
10501
  return false;
10093
10502
  }
@@ -10162,6 +10571,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
10162
10571
  case GGML_OP_IM2COL:
10163
10572
  func = ggml_cuda_im2col;
10164
10573
  break;
10574
+ case GGML_OP_POOL_2D:
10575
+ func = ggml_cuda_pool2d;
10576
+ break;
10165
10577
  case GGML_OP_SUM_ROWS:
10166
10578
  func = ggml_cuda_sum_rows;
10167
10579
  break;
@@ -10283,15 +10695,11 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
10283
10695
 
10284
10696
  if (ggml_is_quantized(tensor->type)) {
10285
10697
  // initialize padding to 0 to avoid possible NaN values
10286
- int64_t row_low = 0;
10287
- int64_t row_high = ggml_nrows(tensor);
10288
- int64_t nrows_split = row_high - row_low;
10289
-
10290
- size_t original_size = ggml_nbytes_split(tensor, nrows_split);
10698
+ size_t original_size = ggml_nbytes(tensor);
10291
10699
  size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
10292
10700
 
10293
10701
  if (padded_size > original_size && tensor->view_src == nullptr) {
10294
- CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
10702
+ CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
10295
10703
  }
10296
10704
  }
10297
10705
  }
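Note: tensor initialization no longer goes through ggml_nbytes_split: the payload size comes from ggml_nbytes, and any extra space the buffer type reserves for quantized tensors is zeroed with a synchronous cudaMemset (instead of the previous stream-ordered cudaMemsetAsync) so uninitialized padding cannot feed NaNs into later kernels. A minimal sketch of that zeroing step, without the file's CUDA_CHECK error handling:

    #include <cuda_runtime.h>

    // Sketch: clear the padding region appended after a tensor's payload.
    static void zero_padding_sketch(void * data, size_t original_size, size_t padded_size) {
        if (padded_size > original_size) {
            cudaMemset((char *) data + original_size, 0, padded_size - original_size);
        }
    }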
@@ -10394,12 +10802,7 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
10394
10802
  }
10395
10803
 
10396
10804
  GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
10397
- int64_t row_low = 0;
10398
- int64_t row_high = ggml_nrows(tensor);
10399
- int64_t nrows_split = row_high - row_low;
10400
-
10401
- size_t size = ggml_nbytes_split(tensor, nrows_split);
10402
-
10805
+ size_t size = ggml_nbytes(tensor);
10403
10806
  int64_t ne0 = tensor->ne[0];
10404
10807
 
10405
10808
  if (ggml_is_quantized(tensor->type)) {
@@ -10428,6 +10831,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
10428
10831
  /* .get_name = */ ggml_backend_cuda_buffer_type_name,
10429
10832
  /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
10430
10833
  /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
10834
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
10431
10835
  /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
10432
10836
  /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
10433
10837
  /* .is_host = */ NULL,
@@ -10703,6 +11107,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
10703
11107
  /* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
10704
11108
  /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
10705
11109
  /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
11110
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
10706
11111
  /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
10707
11112
  /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
10708
11113
  /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
@@ -10782,6 +11187,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
10782
11187
  /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
10783
11188
  /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
10784
11189
  /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
11190
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
10785
11191
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
10786
11192
  /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
10787
11193
  /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
@@ -10896,6 +11302,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
10896
11302
  case GGML_UNARY_OP_GELU:
10897
11303
  case GGML_UNARY_OP_SILU:
10898
11304
  case GGML_UNARY_OP_RELU:
11305
+ case GGML_UNARY_OP_HARDSIGMOID:
11306
+ case GGML_UNARY_OP_HARDSWISH:
10899
11307
  case GGML_UNARY_OP_GELU_QUICK:
10900
11308
  case GGML_UNARY_OP_TANH:
10901
11309
  return true;
@@ -10918,6 +11326,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
10918
11326
  if (a->ne[3] != b->ne[3]) {
10919
11327
  return false;
10920
11328
  }
11329
+ ggml_type a_type = a->type;
11330
+ if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS) {
11331
+ if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
11332
+ return false;
11333
+ }
11334
+ }
10921
11335
  return true;
10922
11336
  } break;
10923
11337
  case GGML_OP_GET_ROWS:
@@ -10957,6 +11371,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
10957
11371
  if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
10958
11372
  return true;
10959
11373
  }
11374
+ if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
11375
+ return true;
11376
+ }
10960
11377
  return false;
10961
11378
  } break;
10962
11379
  case GGML_OP_DUP:
@@ -10985,6 +11402,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
10985
11402
  case GGML_OP_ROPE:
10986
11403
  case GGML_OP_ALIBI:
10987
11404
  case GGML_OP_IM2COL:
11405
+ case GGML_OP_POOL_2D:
10988
11406
  case GGML_OP_SUM_ROWS:
10989
11407
  case GGML_OP_ARGSORT:
10990
11408
  case GGML_OP_ACC: