llama_cpp 0.3.7 → 0.4.0

This diff shows the changes between publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -6,15 +6,116 @@
  #include <atomic>
  #include <assert.h>
 
+ #if defined(GGML_USE_HIPBLAS)
+ #include <hip/hip_runtime.h>
+ #include <hipblas/hipblas.h>
+ #include <hip/hip_fp16.h>
+ #ifdef __HIP_PLATFORM_AMD__
+ // for rocblas_initialize()
+ #include "rocblas/rocblas.h"
+ #endif
+ #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+ #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+ #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+ #define CUBLAS_OP_N HIPBLAS_OP_N
+ #define CUBLAS_OP_T HIPBLAS_OP_T
+ #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+ #define CUBLAS_TF32_TENSOR_OP_MATH 0
+ #define CUDA_R_16F HIPBLAS_R_16F
+ #define CUDA_R_32F HIPBLAS_R_32F
+ #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+ #define cublasCreate hipblasCreate
+ #define cublasGemmEx hipblasGemmEx
+ #define cublasHandle_t hipblasHandle_t
+ #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+ #define cublasSetStream hipblasSetStream
+ #define cublasSgemm hipblasSgemm
+ #define cublasStatus_t hipblasStatus_t
+ #define cudaDeviceProp hipDeviceProp_t
+ #define cudaDeviceSynchronize hipDeviceSynchronize
+ #define cudaError_t hipError_t
+ #define cudaEventCreateWithFlags hipEventCreateWithFlags
+ #define cudaEventDisableTiming hipEventDisableTiming
+ #define cudaEventRecord hipEventRecord
+ #define cudaEvent_t hipEvent_t
+ #define cudaEventDestroy hipEventDestroy
+ #define cudaFree hipFree
+ #define cudaFreeHost hipHostFree
+ #define cudaGetDevice hipGetDevice
+ #define cudaGetDeviceCount hipGetDeviceCount
+ #define cudaGetDeviceProperties hipGetDeviceProperties
+ #define cudaGetErrorString hipGetErrorString
+ #define cudaGetLastError hipGetLastError
+ #define cudaMalloc hipMalloc
+ #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+ #define cudaMemcpy hipMemcpy
+ #define cudaMemcpy2DAsync hipMemcpy2DAsync
+ #define cudaMemcpyAsync hipMemcpyAsync
+ #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+ #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+ #define cudaMemcpyKind hipMemcpyKind
+ #define cudaMemset hipMemset
+ #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+ #define cudaSetDevice hipSetDevice
+ #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+ #define cudaStreamNonBlocking hipStreamNonBlocking
+ #define cudaStreamSynchronize hipStreamSynchronize
+ #define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+ #define cudaStream_t hipStream_t
+ #define cudaSuccess hipSuccess
+ #else
  #include <cuda_runtime.h>
  #include <cublas_v2.h>
  #include <cuda_fp16.h>
+ #endif
 
  #include "ggml-cuda.h"
  #include "ggml.h"
 
  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+ #ifndef CC_TURING
  #define CC_TURING 700
+ #endif
+
+ #if defined(GGML_USE_HIPBLAS)
+ #define __CUDA_ARCH__ 1300
+
+ typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+     const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+     const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+     return reinterpret_cast<const int&>(c);
+ }
+
+ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+ #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+     c = __builtin_amdgcn_sdot4(a, b, c, false);
+ #elif defined(__gfx1100__)
+     c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+ #elif defined(__gfx1010__) || defined(__gfx900__)
+     int tmp1;
+     int tmp2;
+     asm("\n \
+         v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+         v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+         v_add3_u32 %0, %1, %2, %0 \n \
+         v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+         v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+         v_add3_u32 %0, %1, %2, %0 \n \
+         "
+         : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+         : "v"(a), "v"(b)
+     );
+ #else
+     const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+     c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+ #endif
+     return c;
+ }
+ #endif
 
  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
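The added block above is a compile-time compatibility shim: when GGML_USE_HIPBLAS is defined, the CUDA runtime, cuBLAS, and warp-intrinsic names used throughout the file are #define-mapped onto their HIP/hipBLAS equivalents, and the missing byte-wise intrinsics __dp4a and __vsubss4 are emulated per AMD GPU architecture, so the same source builds with either nvcc or hipcc. A minimal standalone sketch of the same idea (hypothetical file and kernel names, not taken from this package):

// sketch.cu - compile with `nvcc sketch.cu` or `hipcc sketch.cu -DGGML_USE_HIPBLAS`
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#define cudaMalloc             hipMalloc
#define cudaMemcpy             hipMemcpy
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaFree               hipFree
#define cudaDeviceSynchronize  hipDeviceSynchronize
#else
#include <cuda_runtime.h>
#endif
#include <cstdio>

__global__ void scale_kernel(float * x, float a, int n) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= a;
    }
}

int main() {
    const int n = 256;
    float host[n];
    for (int i = 0; i < n; ++i) host[i] = 1.0f;

    float * dev = nullptr;
    cudaMalloc((void **) &dev, n*sizeof(float));
    cudaMemcpy(dev, host, n*sizeof(float), cudaMemcpyHostToDevice);
    scale_kernel<<<(n + 63)/64, 64>>>(dev, 2.0f, n);   // same launch syntax under hipcc
    cudaDeviceSynchronize();
    cudaMemcpy(host, dev, n*sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(dev);

    printf("host[0] = %f\n", host[0]); // expected: 2.0
    return 0;
}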
@@ -259,6 +360,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_CPY_BLOCK_SIZE 32
  #define CUDA_SCALE_BLOCK_SIZE 256
  #define CUDA_ROPE_BLOCK_SIZE 256
+ #define CUDA_ALIBI_BLOCK_SIZE 32
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
@@ -286,7 +388,7 @@ static int g_device_count = -1;
  static int g_main_device = 0;
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
- static bool g_mul_mat_q = false;
+ static bool g_mul_mat_q = true;
 
  static void * g_scratch_buffer = nullptr;
  static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
@@ -423,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
  static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
      const block_q4_1 * x = (const block_q4_1 *) vx;
 
-     const dfloat d = x[ib].dm.x;
-     const dfloat m = x[ib].dm.y;
+     const dfloat d = __low2half(x[ib].dm);
+     const dfloat m = __high2half(x[ib].dm);
 
      const int vui = x[ib].qs[iqs];
 
@@ -466,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
  static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
      const block_q5_1 * x = (const block_q5_1 *) vx;
 
-     const dfloat d = x[ib].dm.x;
-     const dfloat m = x[ib].dm.y;
+     const dfloat d = __low2half(x[ib].dm);
+     const dfloat m = __high2half(x[ib].dm);
 
      uint32_t qh;
      memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -519,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
      const uint8_t q = x[i].qs[32*n + l];
      float * y = yy + i*QK_K + 128*n;
 
-     float dall = x[i].dm.x;
-     float dmin = x[i].dm.y;
+     float dall = __low2half(x[i].dm);
+     float dmin = __high2half(x[i].dm);
      y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
      y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
      y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -530,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
      const int il = tid%16; // 0...15
      const uint8_t q = x[i].qs[il] >> (2*is);
      float * y = yy + i*QK_K + 16*is + il;
-     float dall = x[i].dm.x;
-     float dmin = x[i].dm.y;
+     float dall = __low2half(x[i].dm);
+     float dmin = __high2half(x[i].dm);
      y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
      y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
  #endif
@@ -617,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 
      float * y = yy + i*QK_K + 64*il + n*ir;
 
-     const float dall = x[i].dm.x;
-     const float dmin = x[i].dm.y;
+     const float dall = __low2half(x[i].dm);
+     const float dmin = __high2half(x[i].dm);
 
      const uint8_t * q = x[i].qs + 32*il + n*ir;
 
@@ -656,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
 
      float * y = yy + i*QK_K + 64*il + 2*ir;
 
-     const float dall = x[i].dm.x;
-     const float dmin = x[i].dm.y;
+     const float dall = __low2half(x[i].dm);
+     const float dmin = __high2half(x[i].dm);
 
      const uint8_t * ql = x[i].qs + 32*il + 2*ir;
      const uint8_t * qh = x[i].qh + 2*ir;
@@ -769,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
      const float * y = yy + i * QK_K + y_offset;
      const uint8_t * q = x[i].qs + q_offset;
 
-     const float dall = x[i].dm.x;
-     const float dmin = x[i].dm.y;
+     const float dall = __low2half(x[i].dm);
+     const float dmin = __high2half(x[i].dm);
 
      const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
      aux[0] = a[0] & 0x0f0f0f0f;
@@ -990,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
      const float * y1 = yy + i*QK_K + y_offset;
      const float * y2 = y1 + 128;
 
-     const float dall = x[i].dm.x;
-     const float dmin = x[i].dm.y;
+     const float dall = __low2half(x[i].dm);
+     const float dmin = __high2half(x[i].dm);
 
      const uint16_t * a = (const uint16_t *)x[i].scales;
      aux[0] = a[im+0] & kmask1;
@@ -1123,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
      const float * y1 = yy + i*QK_K + y_offset;
      const float * y2 = y1 + 128;
 
-     const float dall = x[i].dm.x;
-     const float dmin = x[i].dm.y;
+     const float dall = __low2half(x[i].dm);
+     const float dmin = __high2half(x[i].dm);
 
      const uint16_t * a = (const uint16_t *)x[i].scales;
      aux[0] = a[im+0] & kmask1;
@@ -1347,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
          return;
      }
 
-     y[ib].ds.x = d;
-     y[ib].ds.y = sum;
+     reinterpret_cast<half&>(y[ib].ds.x) = d;
+     reinterpret_cast<half&>(y[ib].ds.y) = sum;
  }
 
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
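In the hunk above, quantize_q8_1 writes the block scale d and the block sum into the two halves of the single half2 field ds through reinterpret_cast, and the later hunks read them back with __low2half/__high2half (or __low2float/__half22float2) instead of the earlier .x/.y member accesses. A hedged device-side sketch of that unpack pattern, using a simplified stand-in struct rather than the real ggml block types:

// Sketch only: `blk` and `apply_block` are illustrative names, not ggml's.
#include <cuda_fp16.h>

struct blk {
    half2 ds; // packs {d, sum} into one 32-bit register
};

__device__ float apply_block(const blk & b, float sumi) {
    const float2 ds = __half22float2(b.ds); // ds.x == __low2float(b.ds), ds.y == __high2float(b.ds)
    return ds.x*sumi - ds.y;                // scale the integer dot product, subtract the offset term
}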
@@ -1399,6 +1501,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
      // second part effectively subtracts 8 from each quant value
      return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
  #else
+     assert(false);
      return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1436,6 +1539,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
      // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
      return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
  #else
+     assert(false);
      return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1471,6 +1575,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
      // second part effectively subtracts 16 from each quant value
      return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
  #else
+     assert(false);
      return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1516,6 +1621,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
      return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 
  #else
+     assert(false);
      return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1537,6 +1643,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
 
      return d8_0*d8_1 * sumi;
  #else
+     assert(false);
      return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1567,6 +1674,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
      // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
      return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
  #else
+     assert(false);
      return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1602,6 +1710,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 
      return dm2f.x*sumf_d - dm2f.y*sumf_m;
  #else
+     assert(false);
      return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1639,6 +1748,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
 
      return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
  #else
+     assert(false);
      return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1679,6 +1789,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 
      return d3 * sumf;
  #else
+     assert(false);
      return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1704,6 +1815,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
 
      return d3*d8 * sumi;
  #else
+     assert(false);
      return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1737,12 +1849,12 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
      return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
  #else
+     assert(false);
      return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
 
  // contiguous u/y values
- // also used for q5_K
  static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
      const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
      const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
@@ -1752,19 +1864,18 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
      float sumf_m = 0.0f;
 
  #pragma unroll
-     for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) {
+     for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
          int sumi_d = 0;
 
  #pragma unroll
-         for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) {
-             sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product
-             sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
+         for (int j = 0; j < QI8_1; ++j) {
+             sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
          }
 
-         const float2 ds8f = __half22float2(ds8[i0 / 4]);
+         const float2 ds8f = __half22float2(ds8[i]);
 
-         sumf_d += ds8f.x * (sc[i0/4] * sumi_d);
-         sumf_m += ds8f.y * m[i0/4]; // sum of q8_1 block * q4_K min val
+         sumf_d += ds8f.x * (sc[i] * sumi_d);
+         sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
      }
 
      const float2 dm4f = __half22float2(dm4);
@@ -1772,6 +1883,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
      return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
  #else
+     assert(false);
      return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1780,7 +1892,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
  #define VDR_Q5_K_Q8_1_MMQ 8
 
  // contiguous v/x values
- static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
      const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
      const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
 
@@ -1812,6 +1924,41 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
      return dm5f.x*sumf_d - dm5f.y*sumf_m;
 
  #else
+     assert(false);
+     return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ // contiguous u/y values
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
+     const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+     const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+     float sumf_d = 0.0f;
+     float sumf_m = 0.0f;
+
+ #pragma unroll
+     for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+         int sumi_d = 0;
+
+ #pragma unroll
+         for (int j = 0; j < QI8_1; ++j) {
+             sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+         }
+
+         const float2 ds8f = __half22float2(ds8[i]);
+
+         sumf_d += ds8f.x * (sc[i] * sumi_d);
+         sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
+     }
+
+     const float2 dm4f = __half22float2(dm4);
+
+     return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+ #else
+     assert(false);
      return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1842,6 +1989,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 
      return d*sumf;
  #else
+     assert(false);
      return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1873,6 +2021,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
      return d6 * sumf_d;
 
  #else
+     assert(false);
      return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -2298,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
          u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
      }
 
-     return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
+     return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
  }
 
  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2384,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
  #pragma unroll
      for (int i = 0; i < QR2_K; ++ i) {
          u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-         d8[i] = bq8_1[bq8_offset + i].ds.x;
+         d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
      }
 
      return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -2503,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
  #pragma unroll
      for (int i = 0; i < QR3_K; ++i) {
          u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-         d8[i] = bq8_1[bq8_offset + i].ds.x;
+         d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
      }
 
      return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -2672,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 
      for (int i = 0; i < QR4_K; ++i) {
          const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-         d8[i] = bq8i->ds.x;
+         d8[i] = __low2half(bq8i->ds);
 
          const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
          u[2*i+0] = q8[0];
@@ -2699,8 +2848,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
      const float dall = bq4_K->d[0];
      const float dmin = bq4_K->d[1];
 
-     const float d8_1 = bq8_1[0].ds.x;
-     const float d8_2 = bq8_1[1].ds.x;
+     const float d8_1 = __low2float(bq8_1[0].ds);
+     const float d8_2 = __low2float(bq8_1[1].ds);
 
      const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
      const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2722,6 +2871,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
      return dall * sumf_d - dmin * sumf_m;
 
  #else
+     assert(false);
      return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
@@ -2808,18 +2958,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
      const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
      const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
-     int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
-
- #pragma unroll
-     for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
-         v[l + 0]         = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
-         v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
-     }
-
      const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
 
      const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
-     return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
+     return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
+         x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
  }
 
  static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
@@ -2859,14 +3002,14 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  #pragma unroll
      for (int i = 0; i < QR5_K; ++i) {
          const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-         d8[i] = bq8i->ds.x;
+         d8[i] = __low2float(bq8i->ds);
 
          const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
          u[2*i+0] = q8[0];
          u[2*i+1] = q8[4];
      }
 
-     return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
+     return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
 
  #else
 
@@ -2877,8 +3020,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 
      const float d = bq5_K->d;
 
-     const float d8_1 = bq8_1[0].ds.x;
-     const float d8_2 = bq8_1[1].ds.x;
+     const float d8_1 = __low2half(bq8_1[0].ds);
+     const float d8_2 = __low2half(bq8_1[1].ds);
 
      const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
      const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2905,6 +3048,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
      return d * sumf_d;
 
  #else
+     assert(false);
      return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
@@ -3008,7 +3152,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
 
      const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
      const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
-     return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
+     return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
+         x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
  }
 
  static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -3031,7 +3176,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
  #pragma unroll
      for (int i = 0; i < QR6_K; ++i) {
          u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-         d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
+         d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
      }
 
      return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -3135,7 +3280,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
 
  template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
            allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
- static __global__ void mul_mat_q(
+ static __device__ __forceinline__ void mul_mat_q(
      const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
      const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
 
@@ -3150,7 +3295,6 @@ static __global__ void mul_mat_q(
 
      const int row_dst_0 = blockIdx.x*mmq_y;
      const int & row_x_0 = row_dst_0;
-     const int row_dst = row_dst_0 + threadIdx.x;
 
      const int col_dst_0 = blockIdx.y*mmq_x;
      const int & col_y_0 = col_dst_0;
@@ -3200,7 +3344,7 @@ static __global__ void mul_mat_q(
              *dsi_dst = *dsi_src;
          } else {
              float * dfi_dst = (float *) dsi_dst;
-             *dfi_dst = (*dsi_src).x;
+             *dfi_dst = __low2half(*dsi_src);
          }
      }
 
@@ -3223,11 +3367,7 @@ static __global__ void mul_mat_q(
          }
      }
 
-
-     if (row_dst >= nrows_dst) {
-         return;
-     }
-
+ #pragma unroll
      for (int j = 0; j < mmq_x; j += nwarps) {
          const int col_dst = col_dst_0 + j + threadIdx.y;
 
@@ -3235,12 +3375,375 @@ static __global__ void mul_mat_q(
3235
3375
  return;
3236
3376
  }
3237
3377
 
3378
+ #pragma unroll
3238
3379
  for (int i = 0; i < mmq_y; i += WARP_SIZE) {
3239
- dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/nwarps];
3380
+ const int row_dst = row_dst_0 + threadIdx.x + i;
3381
+
3382
+ if (row_dst >= nrows_dst) {
3383
+ continue;
3384
+ }
3385
+
3386
+ dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
3240
3387
  }
3241
3388
  }
3242
3389
  }
3243
3390
 
3391
+ #define MMQ_X_Q4_0_AMPERE 64
3392
+ #define MMQ_Y_Q4_0_AMPERE 128
3393
+ #define NWARPS_Q4_0_AMPERE 4
3394
+ #define MMQ_X_Q4_0_PASCAL 64
3395
+ #define MMQ_Y_Q4_0_PASCAL 64
3396
+ #define NWARPS_Q4_0_PASCAL 8
3397
+
3398
+ template <bool need_check> static __global__ void mul_mat_q4_0(
3399
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3400
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3401
+
3402
+ #if __CUDA_ARCH__ >= CC_TURING
3403
+ const int mmq_x = MMQ_X_Q4_0_AMPERE;
3404
+ const int mmq_y = MMQ_Y_Q4_0_AMPERE;
3405
+ const int nwarps = NWARPS_Q4_0_AMPERE;
3406
+
3407
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3408
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3409
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3410
+
3411
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3412
+ const int mmq_x = MMQ_X_Q4_0_PASCAL;
3413
+ const int mmq_y = MMQ_Y_Q4_0_PASCAL;
3414
+ const int nwarps = NWARPS_Q4_0_PASCAL;
3415
+
3416
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3417
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3418
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3419
+ #else
3420
+ (void) vec_dot_q4_0_q8_1_mul_mat;
3421
+ assert(false);
3422
+ #endif // __CUDA_ARCH__ >= CC_TURING
3423
+ }
3424
+
3425
+ #define MMQ_X_Q4_1_AMPERE 64
3426
+ #define MMQ_Y_Q4_1_AMPERE 128
3427
+ #define NWARPS_Q4_1_AMPERE 4
3428
+ #define MMQ_X_Q4_1_PASCAL 64
3429
+ #define MMQ_Y_Q4_1_PASCAL 64
3430
+ #define NWARPS_Q4_1_PASCAL 8
3431
+
3432
+ template <bool need_check> static __global__ void
3433
+ #if __CUDA_ARCH__ < CC_TURING
3434
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
3435
+ #endif // __CUDA_ARCH__ < CC_TURING
3436
+ mul_mat_q4_1(
3437
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3438
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3439
+
3440
+ #if __CUDA_ARCH__ >= CC_TURING
3441
+ const int mmq_x = MMQ_X_Q4_1_AMPERE;
3442
+ const int mmq_y = MMQ_Y_Q4_1_AMPERE;
3443
+ const int nwarps = NWARPS_Q4_1_AMPERE;
3444
+
3445
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
3446
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3447
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3448
+
3449
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3450
+ const int mmq_x = MMQ_X_Q4_1_PASCAL;
3451
+ const int mmq_y = MMQ_Y_Q4_1_PASCAL;
3452
+ const int nwarps = NWARPS_Q4_1_PASCAL;
3453
+
3454
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
3455
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3456
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3457
+ #else
3458
+ (void) vec_dot_q4_1_q8_1_mul_mat;
3459
+ assert(false);
3460
+ #endif // __CUDA_ARCH__ >= CC_TURING
3461
+ }
3462
+
3463
+ #define MMQ_X_Q5_0_AMPERE 128
3464
+ #define MMQ_Y_Q5_0_AMPERE 64
3465
+ #define NWARPS_Q5_0_AMPERE 4
3466
+ #define MMQ_X_Q5_0_PASCAL 64
3467
+ #define MMQ_Y_Q5_0_PASCAL 64
3468
+ #define NWARPS_Q5_0_PASCAL 8
3469
+
3470
+ template <bool need_check> static __global__ void mul_mat_q5_0(
3471
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3472
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3473
+
3474
+ #if __CUDA_ARCH__ >= CC_TURING
3475
+ const int mmq_x = MMQ_X_Q5_0_AMPERE;
3476
+ const int mmq_y = MMQ_Y_Q5_0_AMPERE;
3477
+ const int nwarps = NWARPS_Q5_0_AMPERE;
3478
+
3479
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
3480
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3481
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3482
+
3483
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3484
+ const int mmq_x = MMQ_X_Q5_0_PASCAL;
3485
+ const int mmq_y = MMQ_Y_Q5_0_PASCAL;
3486
+ const int nwarps = NWARPS_Q5_0_PASCAL;
3487
+
3488
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
3489
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3490
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3491
+ #else
3492
+ (void) vec_dot_q5_0_q8_1_mul_mat;
3493
+ assert(false);
3494
+ #endif // __CUDA_ARCH__ >= CC_TURING
3495
+ }
3496
+
3497
+ #define MMQ_X_Q5_1_AMPERE 128
3498
+ #define MMQ_Y_Q5_1_AMPERE 64
3499
+ #define NWARPS_Q5_1_AMPERE 4
3500
+ #define MMQ_X_Q5_1_PASCAL 64
3501
+ #define MMQ_Y_Q5_1_PASCAL 64
3502
+ #define NWARPS_Q5_1_PASCAL 8
3503
+
3504
+ template <bool need_check> static __global__ void mul_mat_q5_1(
3505
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3506
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3507
+
3508
+ #if __CUDA_ARCH__ >= CC_TURING
3509
+ const int mmq_x = MMQ_X_Q5_1_AMPERE;
3510
+ const int mmq_y = MMQ_Y_Q5_1_AMPERE;
3511
+ const int nwarps = NWARPS_Q5_1_AMPERE;
3512
+
3513
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
3514
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3515
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3516
+
3517
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3518
+ const int mmq_x = MMQ_X_Q5_1_PASCAL;
3519
+ const int mmq_y = MMQ_Y_Q5_1_PASCAL;
3520
+ const int nwarps = NWARPS_Q5_1_PASCAL;
3521
+
3522
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
3523
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3524
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3525
+ #else
3526
+ (void) vec_dot_q5_1_q8_1_mul_mat;
3527
+ assert(false);
3528
+ #endif // __CUDA_ARCH__ >= CC_TURING
3529
+ }
3530
+
3531
+ #define MMQ_X_Q8_0_AMPERE 128
3532
+ #define MMQ_Y_Q8_0_AMPERE 64
3533
+ #define NWARPS_Q8_0_AMPERE 4
3534
+ #define MMQ_X_Q8_0_PASCAL 64
3535
+ #define MMQ_Y_Q8_0_PASCAL 64
3536
+ #define NWARPS_Q8_0_PASCAL 8
3537
+
3538
+ template <bool need_check> static __global__ void mul_mat_q8_0(
3539
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3540
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3541
+
3542
+ #if __CUDA_ARCH__ >= CC_TURING
3543
+ const int mmq_x = MMQ_X_Q8_0_AMPERE;
3544
+ const int mmq_y = MMQ_Y_Q8_0_AMPERE;
3545
+ const int nwarps = NWARPS_Q8_0_AMPERE;
3546
+
3547
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
3548
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3549
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3550
+
3551
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3552
+ const int mmq_x = MMQ_X_Q8_0_PASCAL;
3553
+ const int mmq_y = MMQ_Y_Q8_0_PASCAL;
3554
+ const int nwarps = NWARPS_Q8_0_PASCAL;
3555
+
3556
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
3557
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3558
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3559
+ #else
3560
+ (void) vec_dot_q8_0_q8_1_mul_mat;
3561
+ assert(false);
3562
+ #endif // __CUDA_ARCH__ >= CC_TURING
3563
+ }
3564
+
3565
+ #define MMQ_X_Q2_K_AMPERE 64
3566
+ #define MMQ_Y_Q2_K_AMPERE 128
3567
+ #define NWARPS_Q2_K_AMPERE 4
3568
+ #define MMQ_X_Q2_K_PASCAL 64
3569
+ #define MMQ_Y_Q2_K_PASCAL 64
3570
+ #define NWARPS_Q2_K_PASCAL 8
3571
+
3572
+ template <bool need_check> static __global__ void mul_mat_q2_K(
3573
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3574
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3575
+
3576
+ #if __CUDA_ARCH__ >= CC_TURING
3577
+ const int mmq_x = MMQ_X_Q2_K_AMPERE;
3578
+ const int mmq_y = MMQ_Y_Q2_K_AMPERE;
3579
+ const int nwarps = NWARPS_Q2_K_AMPERE;
3580
+
3581
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
3582
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3583
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3584
+
3585
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3586
+ const int mmq_x = MMQ_X_Q2_K_PASCAL;
3587
+ const int mmq_y = MMQ_Y_Q2_K_PASCAL;
3588
+ const int nwarps = NWARPS_Q2_K_PASCAL;
3589
+
3590
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
3591
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3592
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3593
+ #else
3594
+ (void) vec_dot_q2_K_q8_1_mul_mat;
3595
+ assert(false);
3596
+ #endif // __CUDA_ARCH__ >= CC_TURING
3597
+ }
3598
+
3599
+ #define MMQ_X_Q3_K_AMPERE 128
3600
+ #define MMQ_Y_Q3_K_AMPERE 128
3601
+ #define NWARPS_Q3_K_AMPERE 4
3602
+ #define MMQ_X_Q3_K_PASCAL 64
3603
+ #define MMQ_Y_Q3_K_PASCAL 64
3604
+ #define NWARPS_Q3_K_PASCAL 8
3605
+
3606
+ template <bool need_check> static __global__ void
3607
+ #if __CUDA_ARCH__ < CC_TURING
3608
+ __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
3609
+ #endif // __CUDA_ARCH__ < CC_TURING
3610
+ mul_mat_q3_K(
3611
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3612
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3613
+
3614
+ #if __CUDA_ARCH__ >= CC_TURING
3615
+ const int mmq_x = MMQ_X_Q3_K_AMPERE;
3616
+ const int mmq_y = MMQ_Y_Q3_K_AMPERE;
3617
+ const int nwarps = NWARPS_Q3_K_AMPERE;
3618
+
3619
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
3620
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3621
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3622
+
3623
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3624
+ const int mmq_x = MMQ_X_Q3_K_PASCAL;
3625
+ const int mmq_y = MMQ_Y_Q3_K_PASCAL;
3626
+ const int nwarps = NWARPS_Q3_K_PASCAL;
3627
+
3628
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
3629
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3630
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3631
+ #else
3632
+ (void) vec_dot_q3_K_q8_1_mul_mat;
3633
+ assert(false);
3634
+ #endif // __CUDA_ARCH__ >= CC_TURING
3635
+ }
3636
+
3637
+ #define MMQ_X_Q4_K_AMPERE 64
3638
+ #define MMQ_Y_Q4_K_AMPERE 128
3639
+ #define NWARPS_Q4_K_AMPERE 4
3640
+ #define MMQ_X_Q4_K_PASCAL 64
3641
+ #define MMQ_Y_Q4_K_PASCAL 64
3642
+ #define NWARPS_Q4_K_PASCAL 8
3643
+
3644
+ template <bool need_check> static __global__ void
3645
+ #if __CUDA_ARCH__ < CC_TURING
3646
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
3647
+ #endif // __CUDA_ARCH__ < CC_TURING
3648
+ mul_mat_q4_K(
3649
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3650
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3651
+
3652
+ #if __CUDA_ARCH__ >= CC_TURING
3653
+ const int mmq_x = MMQ_X_Q4_K_AMPERE;
3654
+ const int mmq_y = MMQ_Y_Q4_K_AMPERE;
3655
+ const int nwarps = NWARPS_Q4_K_AMPERE;
3656
+
3657
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
3658
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3659
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3660
+
3661
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3662
+ const int mmq_x = MMQ_X_Q4_K_PASCAL;
3663
+ const int mmq_y = MMQ_Y_Q4_K_PASCAL;
3664
+ const int nwarps = NWARPS_Q4_K_PASCAL;
3665
+
3666
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
3667
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3668
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3669
+ #else
3670
+ (void) vec_dot_q4_K_q8_1_mul_mat;
3671
+ assert(false);
3672
+ #endif // __CUDA_ARCH__ >= CC_TURING
3673
+ }
3674
+
3675
+ #define MMQ_X_Q5_K_AMPERE 64
3676
+ #define MMQ_Y_Q5_K_AMPERE 128
3677
+ #define NWARPS_Q5_K_AMPERE 4
3678
+ #define MMQ_X_Q5_K_PASCAL 64
3679
+ #define MMQ_Y_Q5_K_PASCAL 64
3680
+ #define NWARPS_Q5_K_PASCAL 8
3681
+
3682
+ template <bool need_check> static __global__ void mul_mat_q5_K(
3683
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3684
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3685
+
3686
+ #if __CUDA_ARCH__ >= CC_TURING
3687
+ const int mmq_x = MMQ_X_Q5_K_AMPERE;
3688
+ const int mmq_y = MMQ_Y_Q5_K_AMPERE;
3689
+ const int nwarps = NWARPS_Q5_K_AMPERE;
3690
+
3691
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
3692
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
3693
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3694
+
3695
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3696
+ const int mmq_x = MMQ_X_Q5_K_PASCAL;
3697
+ const int mmq_y = MMQ_Y_Q5_K_PASCAL;
3698
+ const int nwarps = NWARPS_Q5_K_PASCAL;
3699
+
3700
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
3701
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
3702
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3703
+ #else
3704
+ (void) vec_dot_q5_K_q8_1_mul_mat;
3705
+ assert(false);
3706
+ #endif // __CUDA_ARCH__ >= CC_TURING
3707
+ }
3708
+
3709
+ #define MMQ_X_Q6_K_AMPERE 64
3710
+ #define MMQ_Y_Q6_K_AMPERE 64
3711
+ #define NWARPS_Q6_K_AMPERE 4
3712
+ #define MMQ_X_Q6_K_PASCAL 64
3713
+ #define MMQ_Y_Q6_K_PASCAL 64
3714
+ #define NWARPS_Q6_K_PASCAL 8
3715
+
3716
+ template <bool need_check> static __global__ void
3717
+ #if __CUDA_ARCH__ < CC_TURING
3718
+ __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
3719
+ #endif // __CUDA_ARCH__ < CC_TURING
3720
+ mul_mat_q6_K(
3721
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3722
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3723
+
3724
+ #if __CUDA_ARCH__ >= CC_TURING
3725
+ const int mmq_x = MMQ_X_Q6_K_AMPERE;
3726
+ const int mmq_y = MMQ_Y_Q6_K_AMPERE;
3727
+ const int nwarps = NWARPS_Q6_K_AMPERE;
3728
+
3729
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
3730
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
3731
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3732
+
3733
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3734
+ const int mmq_x = MMQ_X_Q6_K_PASCAL;
3735
+ const int mmq_y = MMQ_Y_Q6_K_PASCAL;
3736
+ const int nwarps = NWARPS_Q6_K_PASCAL;
3737
+
3738
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
3739
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
3740
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3741
+ #else
3742
+ (void) vec_dot_q6_K_q8_1_mul_mat;
3743
+ assert(false);
3744
+ #endif // __CUDA_ARCH__ >= CC_TURING
3745
+ }
3746
+
3244
3747
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
3245
3748
  static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
3246
3749
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -3485,13 +3988,13 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
3485
3988
  // rope == RoPE == rotary positional embedding
3486
3989
  static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
3487
3990
  const float p_delta, const int p_delta_rows, const float theta_scale) {
3488
- const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
3991
+ const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
3489
3992
 
3490
3993
  if (col >= ncols) {
3491
3994
  return;
3492
3995
  }
3493
3996
 
3494
- const int row = blockDim.y*blockIdx.y + threadIdx.y;
3997
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
3495
3998
  const int i = row*ncols + col;
3496
3999
 
3497
4000
  const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
@@ -3505,6 +4008,28 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
3505
4008
  dst[i + 1] = x0*sin_theta + x1*cos_theta;
3506
4009
  }
3507
4010
 
4011
+ static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
4012
+ const float p_delta, const int p_delta_rows, const float theta_scale) {
4013
+ const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
4014
+
4015
+ if (col >= ncols) {
4016
+ return;
4017
+ }
4018
+
4019
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
4020
+ const int i = row*ncols + col/2;
4021
+
4022
+ const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
4023
+ const float sin_theta = sinf(theta);
4024
+ const float cos_theta = cosf(theta);
4025
+
4026
+ const float x0 = x[i + 0];
4027
+ const float x1 = x[i + ncols/2];
4028
+
4029
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
4030
+ dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
4031
+ }
4032
+
3508
4033
  static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
3509
4034
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
3510
4035
  const int half_n_dims = ncols/4;
@@ -3539,9 +4064,32 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
3539
4064
  dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
3540
4065
  }
3541
4066
 
3542
- static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
4067
+ static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
4068
+ const int n_heads_log2_floor, const float m0, const float m1) {
3543
4069
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
4070
+
4071
+ if (col >= ncols) {
4072
+ return;
4073
+ }
4074
+
3544
4075
  const int row = blockDim.y*blockIdx.y + threadIdx.y;
4076
+ const int i = row*ncols + col;
4077
+
4078
+ const int k = row/k_rows;
4079
+
4080
+ float m_k;
4081
+ if (k < n_heads_log2_floor) {
4082
+ m_k = powf(m0, k + 1);
4083
+ } else {
4084
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
4085
+ }
4086
+
4087
+ dst[i] = col * m_k + x[i];
4088
+ }
4089
+
4090
+ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
4091
+ const int col = blockDim.y*blockIdx.y + threadIdx.y;
4092
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
3545
4093
 
3546
4094
  if (col >= ncols) {
3547
4095
  return;
@@ -3554,24 +4102,29 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
 
  // the CUDA soft max implementation differs from the CPU implementation
  // instead of doubles floats are used
- // values are also not normalized to the maximum value by subtracting it in the exponential function
- // theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
  static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
-     const int row = blockDim.y*blockIdx.y + threadIdx.y;
-     const int block_size = blockDim.x;
-     const int tid = threadIdx.x;
+     const int row = blockDim.x*blockIdx.x + threadIdx.x;
+     const int block_size = blockDim.y;
+     const int tid = threadIdx.y;
 
-     float tmp = 0.0;
+     float max_val = -INFINITY;
 
-     for (int block_start = 0; block_start < ncols; block_start += block_size) {
-         const int col = block_start + tid;
+     for (int col = tid; col < ncols; col += block_size) {
+         const int i = row*ncols + col;
+         max_val = max(max_val, x[i]);
+     }
 
-         if (col >= ncols) {
-             break;
-         }
+     // find the max value in the block
+ #pragma unroll
+     for (int mask = 16; mask > 0; mask >>= 1) {
+         max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+     }
 
+     float tmp = 0.f;
+
+     for (int col = tid; col < ncols; col += block_size) {
          const int i = row*ncols + col;
-         const float val = expf(x[i]);
+         const float val = expf(x[i] - max_val);
          tmp += val;
          dst[i] = val;
      }
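The rewritten soft_max_f32 above (continued in the next hunk) makes the kernel numerically stable: each row's maximum is found first, reduced across the warp with __shfl_xor_sync, and subtracted inside expf so that large logits no longer overflow, and the following hunk then normalizes by multiplying with the precomputed reciprocal of the sum. A short host-side C++ sketch of the same formulation (illustrative only, not code from this package):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Numerically stable softmax of one row: softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x)).
static void softmax_row(std::vector<float> & x) {
    const float max_val = *std::max_element(x.begin(), x.end());
    float sum = 0.0f;
    for (float & v : x) {
        v = std::exp(v - max_val); // shifting by the max keeps exp() in a safe range
        sum += v;
    }
    const float inv_sum = 1.0f/sum;
    for (float & v : x) {
        v *= inv_sum;
    }
}

int main() {
    std::vector<float> logits = {1000.0f, 1001.0f, 1002.0f}; // would overflow expf() without the shift
    softmax_row(logits);
    printf("%.6f %.6f %.6f\n", logits[0], logits[1], logits[2]); // ~0.090031 0.244728 0.665241
    return 0;
}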
@@ -3582,15 +4135,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
          tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
      }
 
-     for (int block_start = 0; block_start < ncols; block_start += block_size) {
-         const int col = block_start + tid;
-
-         if (col >= ncols) {
-             break;
-         }
+     const float inv_tmp = 1.f / tmp;
 
+     for (int col = tid; col < ncols; col += block_size) {
          const int i = row*ncols + col;
-         dst[i] /= tmp;
+         dst[i] *= inv_tmp;
      }
  }
 
@@ -3942,48 +4491,32 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
3942
4491
  CUDA_CHECK(cudaGetDevice(&id));
3943
4492
  const int compute_capability = g_compute_capabilities[id];
3944
4493
 
4494
+ int mmq_x, mmq_y, nwarps;
3945
4495
  if (compute_capability >= CC_TURING) {
3946
- const int mmq_x = 64;
3947
- const int mmq_y = 128;
3948
- const int nwarps = 4;
3949
-
3950
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
3951
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3952
- const dim3 block_nums(block_num_x, block_num_y, 1);
3953
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
3954
-
3955
- if (nrows_x % mmq_y == 0) {
3956
- const bool need_check = false;
3957
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3958
- load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3959
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3960
- } else {
3961
- const bool need_check = true;
3962
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3963
- load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3964
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3965
- }
4496
+ mmq_x = MMQ_X_Q4_0_AMPERE;
4497
+ mmq_y = MMQ_Y_Q4_0_AMPERE;
4498
+ nwarps = NWARPS_Q4_0_AMPERE;
4499
+ } else if (compute_capability >= MIN_CC_DP4A) {
4500
+ mmq_x = MMQ_X_Q4_0_PASCAL;
4501
+ mmq_y = MMQ_Y_Q4_0_PASCAL;
4502
+ nwarps = NWARPS_Q4_0_PASCAL;
3966
4503
  } else {
3967
- const int mmq_x = 64;
3968
- const int mmq_y = 64;
3969
- const int nwarps = 4;
3970
-
3971
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
3972
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3973
- const dim3 block_nums(block_num_x, block_num_y, 1);
3974
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
3975
-
3976
- if (nrows_x % mmq_y == 0) {
3977
- const bool need_check = false;
3978
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3979
- load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3980
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3981
- } else {
3982
- const bool need_check = true;
3983
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3984
- load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3985
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3986
- }
4504
+ GGML_ASSERT(false);
4505
+ }
4506
+
4507
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4508
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4509
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4510
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4511
+
4512
+ if (nrows_x % mmq_y == 0) {
4513
+ const bool need_check = false;
4514
+ mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
4515
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4516
+ } else {
4517
+ const bool need_check = true;
4518
+ mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
4519
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3987
4520
  }
3988
4521
  }
3989
4522
 
@@ -3995,49 +4528,32 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
3995
4528
  CUDA_CHECK(cudaGetDevice(&id));
3996
4529
  const int compute_capability = g_compute_capabilities[id];
3997
4530
 
4531
+ int mmq_x, mmq_y, nwarps;
3998
4532
  if (compute_capability >= CC_TURING) {
3999
- const int mmq_x = 64;
4000
- const int mmq_y = 128;
4001
- const int nwarps = 4;
4002
-
4003
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4004
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4005
- const dim3 block_nums(block_num_x, block_num_y, 1);
4006
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4007
-
4008
- if (nrows_x % mmq_y == 0) {
4009
- const bool need_check = false;
4010
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4011
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4012
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4013
- } else {
4014
- const bool need_check = true;
4015
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4016
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4017
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4018
- }
4533
+ mmq_x = MMQ_X_Q4_1_AMPERE;
4534
+ mmq_y = MMQ_Y_Q4_1_AMPERE;
4535
+ nwarps = NWARPS_Q4_1_AMPERE;
4536
+ } else if (compute_capability >= MIN_CC_DP4A) {
4537
+ mmq_x = MMQ_X_Q4_1_PASCAL;
4538
+ mmq_y = MMQ_Y_Q4_1_PASCAL;
4539
+ nwarps = NWARPS_Q4_1_PASCAL;
4019
4540
  } else {
4020
- const int mmq_x = 64;
4021
- const int mmq_y = 64;
4022
- const int nwarps = 8;
4023
-
4024
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4025
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4026
- const dim3 block_nums(block_num_x, block_num_y, 1);
4027
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4028
-
4029
- if (nrows_x % mmq_y == 0) {
4030
- const bool need_check = false;
4031
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4032
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4033
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4034
- } else {
4035
- const bool need_check = true;
4036
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4037
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4038
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4039
- }
4541
+ GGML_ASSERT(false);
4542
+ }
4040
4543
 
4544
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4545
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4546
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4547
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4548
+
4549
+ if (nrows_x % mmq_y == 0) {
4550
+ const bool need_check = false;
4551
+ mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
4552
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4553
+ } else {
4554
+ const bool need_check = true;
4555
+ mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
4556
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4041
4557
  }
4042
4558
  }
4043
4559
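The hunk above and the eight that follow (q5_0 through q6_K) apply one and the same refactor: the tile shape and warp count move out of the two hand-written launch branches into per-architecture constants (MMQ_X_*/MMQ_Y_*/NWARPS_* for AMPERE and PASCAL), the long mul_mat_q<...> template instantiation is wrapped behind a per-type kernel such as mul_mat_q4_1<need_check>, and devices below MIN_CC_DP4A now trip GGML_ASSERT(false) instead of silently taking the small-tile path. A minimal host-side sketch of the resulting dispatch, using the tile sizes from the removed q4_1 branches as stand-ins for the new constants (whose values are defined elsewhere in this file):

    #include <cstdio>

    // Stand-in for the per-type MMQ_X_*/MMQ_Y_*/NWARPS_* constants; the values here are
    // the ones the removed q4_1 branches used, purely for illustration.
    struct mmq_config { int mmq_x, mmq_y, nwarps; };

    static mmq_config pick_mmq_config(int compute_capability) {
        if (compute_capability >= 700) {          // CC_TURING: *_AMPERE constants
            return {64, 128, 4};
        } else if (compute_capability >= 610) {   // MIN_CC_DP4A: *_PASCAL constants
            return {64, 64, 8};
        }
        return {0, 0, 0};                         // the real code hits GGML_ASSERT(false) here
    }

    int main() {
        const int nrows_x = 4096, ncols_y = 512;  // hypothetical matrix extents
        const mmq_config c = pick_mmq_config(700);

        const int block_num_x = (nrows_x + c.mmq_y - 1) / c.mmq_y; // ceil(nrows_x / mmq_y)
        const int block_num_y = (ncols_y + c.mmq_x - 1) / c.mmq_x; // ceil(ncols_y / mmq_x)
        const bool need_check = nrows_x % c.mmq_y != 0;            // partial row tile -> bounds-checked kernel

        printf("grid=(%d,%d,1) block=(WARP_SIZE,%d,1) need_check=%d\n",
               block_num_x, block_num_y, c.nwarps, need_check);
        return 0;
    }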
 
@@ -4049,48 +4565,32 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
4049
4565
  CUDA_CHECK(cudaGetDevice(&id));
4050
4566
  const int compute_capability = g_compute_capabilities[id];
4051
4567
 
4568
+ int mmq_x, mmq_y, nwarps;
4052
4569
  if (compute_capability >= CC_TURING) {
4053
- const int mmq_x = 128;
4054
- const int mmq_y = 64;
4055
- const int nwarps = 4;
4056
-
4057
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4058
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4059
- const dim3 block_nums(block_num_x, block_num_y, 1);
4060
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4061
-
4062
- if (nrows_x % mmq_y == 0) {
4063
- const bool need_check = false;
4064
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4065
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4066
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4067
- } else {
4068
- const bool need_check = true;
4069
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4070
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4071
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4072
- }
4570
+ mmq_x = MMQ_X_Q5_0_AMPERE;
4571
+ mmq_y = MMQ_Y_Q5_0_AMPERE;
4572
+ nwarps = NWARPS_Q5_0_AMPERE;
4573
+ } else if (compute_capability >= MIN_CC_DP4A) {
4574
+ mmq_x = MMQ_X_Q5_0_PASCAL;
4575
+ mmq_y = MMQ_Y_Q5_0_PASCAL;
4576
+ nwarps = NWARPS_Q5_0_PASCAL;
4073
4577
  } else {
4074
- const int mmq_x = 64;
4075
- const int mmq_y = 64;
4076
- const int nwarps = 8;
4077
-
4078
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4079
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4080
- const dim3 block_nums(block_num_x, block_num_y, 1);
4081
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4082
-
4083
- if (nrows_x % mmq_y == 0) {
4084
- const bool need_check = false;
4085
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4086
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4087
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4088
- } else {
4089
- const bool need_check = true;
4090
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4091
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4092
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4093
- }
4578
+ GGML_ASSERT(false);
4579
+ }
4580
+
4581
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4582
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4583
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4584
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4585
+
4586
+ if (nrows_x % mmq_y == 0) {
4587
+ const bool need_check = false;
4588
+ mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
4589
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4590
+ } else {
4591
+ const bool need_check = true;
4592
+ mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
4593
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4094
4594
  }
4095
4595
  }
4096
4596
 
@@ -4102,48 +4602,32 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
4102
4602
  CUDA_CHECK(cudaGetDevice(&id));
4103
4603
  const int compute_capability = g_compute_capabilities[id];
4104
4604
 
4605
+ int mmq_x, mmq_y, nwarps;
4105
4606
  if (compute_capability >= CC_TURING) {
4106
- const int mmq_x = 128;
4107
- const int mmq_y = 64;
4108
- const int nwarps = 8;
4109
-
4110
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4111
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4112
- const dim3 block_nums(block_num_x, block_num_y, 1);
4113
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4114
-
4115
- if (nrows_x % mmq_y == 0) {
4116
- const bool need_check = false;
4117
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4118
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4119
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4120
- } else {
4121
- const bool need_check = true;
4122
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4123
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4124
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4125
- }
4607
+ mmq_x = MMQ_X_Q5_1_AMPERE;
4608
+ mmq_y = MMQ_Y_Q5_1_AMPERE;
4609
+ nwarps = NWARPS_Q5_1_AMPERE;
4610
+ } else if (compute_capability >= MIN_CC_DP4A) {
4611
+ mmq_x = MMQ_X_Q5_1_PASCAL;
4612
+ mmq_y = MMQ_Y_Q5_1_PASCAL;
4613
+ nwarps = NWARPS_Q5_1_PASCAL;
4126
4614
  } else {
4127
- const int mmq_x = 64;
4128
- const int mmq_y = 64;
4129
- const int nwarps = 8;
4130
-
4131
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4132
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4133
- const dim3 block_nums(block_num_x, block_num_y, 1);
4134
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4135
-
4136
- if (nrows_x % mmq_y == 0) {
4137
- const bool need_check = false;
4138
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4139
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4140
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4141
- } else {
4142
- const bool need_check = true;
4143
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4144
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4145
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4146
- }
4615
+ GGML_ASSERT(false);
4616
+ }
4617
+
4618
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4619
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4620
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4621
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4622
+
4623
+ if (nrows_x % mmq_y == 0) {
4624
+ const bool need_check = false;
4625
+ mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
4626
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4627
+ } else {
4628
+ const bool need_check = true;
4629
+ mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
4630
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4147
4631
  }
4148
4632
  }
4149
4633
 
@@ -4155,48 +4639,32 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
4155
4639
  CUDA_CHECK(cudaGetDevice(&id));
4156
4640
  const int compute_capability = g_compute_capabilities[id];
4157
4641
 
4642
+ int mmq_x, mmq_y, nwarps;
4158
4643
  if (compute_capability >= CC_TURING) {
4159
- const int mmq_x = 128;
4160
- const int mmq_y = 64;
4161
- const int nwarps = 4;
4162
-
4163
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4164
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4165
- const dim3 block_nums(block_num_x, block_num_y, 1);
4166
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4167
-
4168
- if (nrows_x % mmq_y == 0) {
4169
- const bool need_check = false;
4170
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4171
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4172
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4173
- } else {
4174
- const bool need_check = true;
4175
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4176
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4177
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4178
- }
4644
+ mmq_x = MMQ_X_Q8_0_AMPERE;
4645
+ mmq_y = MMQ_Y_Q8_0_AMPERE;
4646
+ nwarps = NWARPS_Q8_0_AMPERE;
4647
+ } else if (compute_capability >= MIN_CC_DP4A) {
4648
+ mmq_x = MMQ_X_Q8_0_PASCAL;
4649
+ mmq_y = MMQ_Y_Q8_0_PASCAL;
4650
+ nwarps = NWARPS_Q8_0_PASCAL;
4179
4651
  } else {
4180
- const int mmq_x = 64;
4181
- const int mmq_y = 64;
4182
- const int nwarps = 8;
4183
-
4184
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4185
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4186
- const dim3 block_nums(block_num_x, block_num_y, 1);
4187
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4188
-
4189
- if (nrows_x % mmq_y == 0) {
4190
- const bool need_check = false;
4191
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4192
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4193
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4194
- } else {
4195
- const bool need_check = true;
4196
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4197
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4198
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4199
- }
4652
+ GGML_ASSERT(false);
4653
+ }
4654
+
4655
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4656
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4657
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4658
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4659
+
4660
+ if (nrows_x % mmq_y == 0) {
4661
+ const bool need_check = false;
4662
+ mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
4663
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4664
+ } else {
4665
+ const bool need_check = true;
4666
+ mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
4667
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4200
4668
  }
4201
4669
  }
4202
4670
 
@@ -4208,48 +4676,32 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
4208
4676
  CUDA_CHECK(cudaGetDevice(&id));
4209
4677
  const int compute_capability = g_compute_capabilities[id];
4210
4678
 
4679
+ int mmq_x, mmq_y, nwarps;
4211
4680
  if (compute_capability >= CC_TURING) {
4212
- const int mmq_x = 64;
4213
- const int mmq_y = 128;
4214
- const int nwarps = 4;
4215
-
4216
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4217
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4218
- const dim3 block_nums(block_num_x, block_num_y, 1);
4219
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4220
-
4221
- if (nrows_x % mmq_y == 0) {
4222
- const bool need_check = false;
4223
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4224
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4225
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4226
- } else {
4227
- const bool need_check = true;
4228
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4229
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4230
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4231
- }
4681
+ mmq_x = MMQ_X_Q2_K_AMPERE;
4682
+ mmq_y = MMQ_Y_Q2_K_AMPERE;
4683
+ nwarps = NWARPS_Q2_K_AMPERE;
4684
+ } else if (compute_capability >= MIN_CC_DP4A) {
4685
+ mmq_x = MMQ_X_Q2_K_PASCAL;
4686
+ mmq_y = MMQ_Y_Q2_K_PASCAL;
4687
+ nwarps = NWARPS_Q2_K_PASCAL;
4232
4688
  } else {
4233
- const int mmq_x = 64;
4234
- const int mmq_y = 64;
4235
- const int nwarps = 8;
4236
-
4237
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4238
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4239
- const dim3 block_nums(block_num_x, block_num_y, 1);
4240
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4241
-
4242
- if (nrows_x % mmq_y == 0) {
4243
- const bool need_check = false;
4244
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4245
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4246
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4247
- } else {
4248
- const bool need_check = true;
4249
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4250
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4251
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4252
- }
4689
+ GGML_ASSERT(false);
4690
+ }
4691
+
4692
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4693
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4694
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4695
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4696
+
4697
+ if (nrows_x % mmq_y == 0) {
4698
+ const bool need_check = false;
4699
+ mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
4700
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4701
+ } else {
4702
+ const bool need_check = true;
4703
+ mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
4704
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4253
4705
  }
4254
4706
  }
4255
4707
 
@@ -4261,48 +4713,32 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
4261
4713
  CUDA_CHECK(cudaGetDevice(&id));
4262
4714
  const int compute_capability = g_compute_capabilities[id];
4263
4715
 
4716
+ int mmq_x, mmq_y, nwarps;
4264
4717
  if (compute_capability >= CC_TURING) {
4265
- const int mmq_x = 128;
4266
- const int mmq_y = 128;
4267
- const int nwarps = 4;
4268
-
4269
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4270
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4271
- const dim3 block_nums(block_num_x, block_num_y, 1);
4272
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4273
-
4274
- if (nrows_x % mmq_y == 0) {
4275
- const bool need_check = false;
4276
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4277
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4278
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4279
- } else {
4280
- const bool need_check = true;
4281
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4282
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4283
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4284
- }
4718
+ mmq_x = MMQ_X_Q3_K_AMPERE;
4719
+ mmq_y = MMQ_Y_Q3_K_AMPERE;
4720
+ nwarps = NWARPS_Q3_K_AMPERE;
4721
+ } else if (compute_capability >= MIN_CC_DP4A) {
4722
+ mmq_x = MMQ_X_Q3_K_PASCAL;
4723
+ mmq_y = MMQ_Y_Q3_K_PASCAL;
4724
+ nwarps = NWARPS_Q3_K_PASCAL;
4285
4725
  } else {
4286
- const int mmq_x = 64;
4287
- const int mmq_y = 64;
4288
- const int nwarps = 8;
4289
-
4290
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4291
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4292
- const dim3 block_nums(block_num_x, block_num_y, 1);
4293
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4294
-
4295
- if (nrows_x % mmq_y == 0) {
4296
- const bool need_check = false;
4297
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4298
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4299
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4300
- } else {
4301
- const bool need_check = true;
4302
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4303
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4304
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4305
- }
4726
+ GGML_ASSERT(false);
4727
+ }
4728
+
4729
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4730
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4731
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4732
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4733
+
4734
+ if (nrows_x % mmq_y == 0) {
4735
+ const bool need_check = false;
4736
+ mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
4737
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4738
+ } else {
4739
+ const bool need_check = true;
4740
+ mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
4741
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4306
4742
  }
4307
4743
  }
4308
4744
 
@@ -4314,48 +4750,32 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
4314
4750
  CUDA_CHECK(cudaGetDevice(&id));
4315
4751
  const int compute_capability = g_compute_capabilities[id];
4316
4752
 
4753
+ int mmq_x, mmq_y, nwarps;
4317
4754
  if (compute_capability >= CC_TURING) {
4318
- const int mmq_x = 64;
4319
- const int mmq_y = 128;
4320
- const int nwarps = 4;
4321
-
4322
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4323
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4324
- const dim3 block_nums(block_num_x, block_num_y, 1);
4325
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4326
-
4327
- if (nrows_x % mmq_y == 0) {
4328
- const bool need_check = false;
4329
- mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
4330
- load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
4331
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4332
- } else {
4333
- const bool need_check = true;
4334
- mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
4335
- load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
4336
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4337
- }
4755
+ mmq_x = MMQ_X_Q4_K_AMPERE;
4756
+ mmq_y = MMQ_Y_Q4_K_AMPERE;
4757
+ nwarps = NWARPS_Q4_K_AMPERE;
4758
+ } else if (compute_capability >= MIN_CC_DP4A) {
4759
+ mmq_x = MMQ_X_Q4_K_PASCAL;
4760
+ mmq_y = MMQ_Y_Q4_K_PASCAL;
4761
+ nwarps = NWARPS_Q4_K_PASCAL;
4338
4762
  } else {
4339
- const int mmq_x = 32;
4340
- const int mmq_y = 64;
4341
- const int nwarps = 8;
4342
-
4343
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4344
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4345
- const dim3 block_nums(block_num_x, block_num_y, 1);
4346
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4347
-
4348
- if (nrows_x % mmq_y == 0) {
4349
- const bool need_check = false;
4350
- mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
4351
- load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
4352
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4353
- } else {
4354
- const bool need_check = true;
4355
- mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
4356
- load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
4357
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4358
- }
4763
+ GGML_ASSERT(false);
4764
+ }
4765
+
4766
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4767
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4768
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4769
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4770
+
4771
+ if (nrows_x % mmq_y == 0) {
4772
+ const bool need_check = false;
4773
+ mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
4774
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4775
+ } else {
4776
+ const bool need_check = true;
4777
+ mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
4778
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4359
4779
  }
4360
4780
  }
4361
4781
 
@@ -4367,48 +4787,32 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
4367
4787
  CUDA_CHECK(cudaGetDevice(&id));
4368
4788
  const int compute_capability = g_compute_capabilities[id];
4369
4789
 
4790
+ int mmq_x, mmq_y, nwarps;
4370
4791
  if (compute_capability >= CC_TURING) {
4371
- const int mmq_x = 64;
4372
- const int mmq_y = 128;
4373
- const int nwarps = 4;
4374
-
4375
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4376
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4377
- const dim3 block_nums(block_num_x, block_num_y, 1);
4378
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4379
-
4380
- if (nrows_x % mmq_y == 0) {
4381
- const bool need_check = false;
4382
- mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4383
- load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4384
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4385
- } else {
4386
- const bool need_check = true;
4387
- mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4388
- load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4389
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4390
- }
4792
+ mmq_x = MMQ_X_Q5_K_AMPERE;
4793
+ mmq_y = MMQ_Y_Q5_K_AMPERE;
4794
+ nwarps = NWARPS_Q5_K_AMPERE;
4795
+ } else if (compute_capability >= MIN_CC_DP4A) {
4796
+ mmq_x = MMQ_X_Q5_K_PASCAL;
4797
+ mmq_y = MMQ_Y_Q5_K_PASCAL;
4798
+ nwarps = NWARPS_Q5_K_PASCAL;
4391
4799
  } else {
4392
- const int mmq_x = 64;
4393
- const int mmq_y = 64;
4394
- const int nwarps = 8;
4395
-
4396
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4397
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4398
- const dim3 block_nums(block_num_x, block_num_y, 1);
4399
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4400
-
4401
- if (nrows_x % mmq_y == 0) {
4402
- const bool need_check = false;
4403
- mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4404
- load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4405
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4406
- } else {
4407
- const bool need_check = true;
4408
- mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4409
- load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4410
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4411
- }
4800
+ GGML_ASSERT(false);
4801
+ }
4802
+
4803
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4804
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4805
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4806
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4807
+
4808
+ if (nrows_x % mmq_y == 0) {
4809
+ const bool need_check = false;
4810
+ mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
4811
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4812
+ } else {
4813
+ const bool need_check = true;
4814
+ mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
4815
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4412
4816
  }
4413
4817
  }
4414
4818
 
@@ -4420,48 +4824,32 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
4420
4824
  CUDA_CHECK(cudaGetDevice(&id));
4421
4825
  const int compute_capability = g_compute_capabilities[id];
4422
4826
 
4827
+ int mmq_x, mmq_y, nwarps;
4423
4828
  if (compute_capability >= CC_TURING) {
4424
- const int mmq_x = 64;
4425
- const int mmq_y = 64;
4426
- const int nwarps = 4;
4427
-
4428
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4429
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4430
- const dim3 block_nums(block_num_x, block_num_y, 1);
4431
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4432
-
4433
- if (nrows_x % mmq_y == 0) {
4434
- const bool need_check = false;
4435
- mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4436
- load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4437
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4438
- } else {
4439
- const bool need_check = true;
4440
- mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4441
- load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4442
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4443
- }
4829
+ mmq_x = MMQ_X_Q6_K_AMPERE;
4830
+ mmq_y = MMQ_Y_Q6_K_AMPERE;
4831
+ nwarps = NWARPS_Q6_K_AMPERE;
4832
+ } else if (compute_capability >= MIN_CC_DP4A) {
4833
+ mmq_x = MMQ_X_Q6_K_PASCAL;
4834
+ mmq_y = MMQ_Y_Q6_K_PASCAL;
4835
+ nwarps = NWARPS_Q6_K_PASCAL;
4444
4836
  } else {
4445
- const int mmq_x = 32;
4446
- const int mmq_y = 64;
4447
- const int nwarps = 8;
4448
-
4449
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4450
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4451
- const dim3 block_nums(block_num_x, block_num_y, 1);
4452
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4453
-
4454
- if (nrows_x % mmq_y == 0) {
4455
- const bool need_check = false;
4456
- mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4457
- load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4458
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4459
- } else {
4460
- const bool need_check = true;
4461
- mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4462
- load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4463
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4464
- }
4837
+ GGML_ASSERT(false);
4838
+ }
4839
+
4840
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4841
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4842
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4843
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4844
+
4845
+ if (nrows_x % mmq_y == 0) {
4846
+ const bool need_check = false;
4847
+ mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
4848
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4849
+ } else {
4850
+ const bool need_check = true;
4851
+ mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
4852
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4465
4853
  }
4466
4854
  }
4467
4855
 
@@ -4511,13 +4899,21 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
4511
4899
 
4512
4900
  static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
4513
4901
  const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
4514
- GGML_ASSERT(nrows % 2 == 0);
4515
- const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
4902
+ GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
4903
+ const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
4516
4904
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
4517
- const dim3 block_nums(num_blocks_x, nrows, 1);
4905
+ const dim3 block_nums(nrows, num_blocks_x, 1);
4518
4906
  rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
4519
4907
  }
4520
4908
 
4909
+ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
4910
+ const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
4911
+ const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
4912
+ const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
4913
+ const dim3 block_nums(nrows, num_blocks_x, 1);
4914
+ rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
4915
+ }
4916
+
4521
4917
  static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
4522
4918
  GGML_ASSERT(nrows % 4 == 0);
4523
4919
  const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@@ -4526,16 +4922,25 @@ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, con
4526
4922
  rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
4527
4923
  }
4528
4924
 
4925
+ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
4926
+ const int k_rows, const int n_heads_log2_floor, const float m0,
4927
+ const float m1, cudaStream_t stream) {
4928
+ const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
4929
+ const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
4930
+ const dim3 block_nums(num_blocks_x, nrows, 1);
4931
+ alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
4932
+ }
4933
+
4529
4934
  static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
4530
- const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
4935
+ const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
4531
4936
  const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
4532
- const dim3 block_nums(block_num_x, nrows_x, 1);
4937
+ const dim3 block_nums(nrows_x, block_num_x, 1);
4533
4938
  diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
4534
4939
  }
4535
4940
 
4536
4941
  static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
4537
- const dim3 block_dims(WARP_SIZE, 1, 1);
4538
- const dim3 block_nums(1, nrows_x, 1);
4942
+ const dim3 block_dims(1, WARP_SIZE, 1);
4943
+ const dim3 block_nums(nrows_x, 1, 1);
4539
4944
  soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
4540
4945
  }
4541
4946
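rope_f32_cuda, diag_mask_inf_f32_cuda and soft_max_f32_cuda above all transpose their launch geometry: the row index moves from blockIdx.y to blockIdx.x and the intra-row thread index from threadIdx.x to threadIdx.y. The hunk itself does not say why; presumably the point is the CUDA grid limits, since gridDim.x may be as large as 2^31 - 1 while gridDim.y is capped at 65535, so very tall inputs no longer overflow the grid. A small sketch of that limit (the row count is the only assumption):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t max_grid_x = 2147483647; // gridDim.x limit on compute capability >= 3.0
        const int64_t max_grid_y = 65535;      // gridDim.y limit
        const int64_t nrows      = 200000;     // hypothetical: e.g. many heads * a long context

        // old layout: block_nums = (num_blocks_x, nrows, 1)  -> nrows must fit in gridDim.y
        // new layout: block_nums = (nrows, num_blocks_x, 1)  -> nrows only has to fit in gridDim.x
        printf("rows per launch, old layout: %lld\n", (long long) std::min(nrows, max_grid_y));
        printf("rows per launch, new layout: %lld\n", (long long) std::min(nrows, max_grid_x));
        return 0;
    }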
 
@@ -4640,10 +5045,18 @@ void ggml_init_cublas() {
4640
5045
  static bool initialized = false;
4641
5046
 
4642
5047
  if (!initialized) {
5048
+
5049
+ #ifdef __HIP_PLATFORM_AMD__
5050
+ // Workaround for a rocBLAS bug when using multiple graphics cards:
5051
+ // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
5052
+ rocblas_initialize();
5053
+ CUDA_CHECK(cudaDeviceSynchronize());
5054
+ #endif
5055
+
4643
5056
  CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
4644
5057
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
4645
5058
  int64_t total_vram = 0;
4646
- fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
5059
+ fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
4647
5060
  for (int id = 0; id < g_device_count; ++id) {
4648
5061
  cudaDeviceProp prop;
4649
5062
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
@@ -5241,7 +5654,8 @@ inline void ggml_cuda_op_rope(
5241
5654
 
5242
5655
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
5243
5656
 
5244
- const bool is_glm = mode & 4;
5657
+ const bool is_neox = mode & 2;
5658
+ const bool is_glm = mode & 4;
5245
5659
 
5246
5660
  // compute
5247
5661
  if (is_glm) {
@@ -5249,6 +5663,10 @@ inline void ggml_cuda_op_rope(
5249
5663
  const float id_p = min(p, n_ctx - 2.f);
5250
5664
  const float block_p = max(p - (n_ctx - 2.f), 0.f);
5251
5665
  rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
5666
+ } else if (is_neox) {
5667
+ GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
5668
+ const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5669
+ rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
5252
5670
  } else {
5253
5671
  const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5254
5672
  rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
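With is_neox now decoded from mode & 2, NeoX-style rotary embeddings get their own launcher instead of falling through to the default path. The kernel bodies are not part of this hunk, but the parameters fed to them (p0, freq_scale as p_delta, and theta_scale = freq_base^(-2/n_dims)) are the usual RoPE angle schedule. A CPU sketch of that schedule, assuming the common convention that the classic path rotates adjacent pairs (2i, 2i+1) while the NeoX path pairs dimension i with i + n_dims/2:

    #include <cmath>
    #include <cstdio>

    // Rotate one (a, b) pair by angle theta: the core RoPE operation.
    static void rotate_pair(float & a, float & b, float theta) {
        const float c = cosf(theta), s = sinf(theta);
        const float a0 = a;
        a = a0*c - b*s;
        b = a0*s + b*c;
    }

    int main() {
        const int   n_dims     = 8;
        const float freq_base  = 10000.0f;
        const float freq_scale = 1.0f;
        const int   n_past     = 5;                        // hypothetical position of this row
        const float theta_scale = powf(freq_base, -2.0f/n_dims);
        const float p = n_past * freq_scale;               // roughly what p0/p_delta reconstruct per row

        float x[8] = {1, 2, 3, 4, 5, 6, 7, 8};

        float theta = p;
        for (int i = 0; i < n_dims/2; ++i) {
            rotate_pair(x[2*i + 0], x[2*i + 1], theta);    // classic pairing; NeoX would use (i, i + n_dims/2)
            theta *= theta_scale;                          // theta_i = p * theta_scale^i
        }
        for (int i = 0; i < n_dims; ++i) {
            printf("%f ", x[i]);
        }
        printf("\n");
        return 0;
    }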
@@ -5261,6 +5679,41 @@ inline void ggml_cuda_op_rope(
5261
5679
  (void) i1;
5262
5680
  }
5263
5681
 
5682
+ inline void ggml_cuda_op_alibi(
5683
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5684
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5685
+ cudaStream_t & cudaStream_main){
5686
+
5687
+ GGML_ASSERT(src0_ddf_i != nullptr);
5688
+ GGML_ASSERT(dst_ddf_i != nullptr);
5689
+
5690
+ const int64_t ne00 = src0->ne[0];
5691
+ const int64_t ne01 = src0->ne[1];
5692
+ const int64_t ne02 = src0->ne[2];
5693
+ const int64_t i01_diff = i01_high - i01_low;
5694
+
5695
+ const int n_past = ((int32_t *) dst->op_params)[0];
5696
+ const int n_head = ((int32_t *) dst->op_params)[1];
5697
+ float max_bias;
5698
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
5699
+
5700
+ GGML_ASSERT(ne01 + n_past == ne00);
5701
+ GGML_ASSERT(n_head == ne02);
5702
+
5703
+ const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
5704
+
5705
+ const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
5706
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
5707
+
5708
+ // compute
5709
+ alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
5710
+
5711
+ (void) src1;
5712
+ (void) src0_ddq_i;
5713
+ (void) src1_ddf_i;
5714
+ (void) i1;
5715
+ }
5716
+
5264
5717
  inline void ggml_cuda_op_diag_mask_inf(
5265
5718
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5266
5719
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
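ggml_cuda_op_alibi pulls n_past, n_head and max_bias out of dst->op_params and derives the two slope bases m0 = 2^(-max_bias / n_heads_log2_floor) and m1 = 2^(-(max_bias/2) / n_heads_log2_floor), where n_heads_log2_floor is the largest power of two not exceeding n_head. How those bases become per-head slopes happens inside alibi_f32, which is not shown here; the sketch below follows the ALiBi paper's schedule for non-power-of-two head counts, which is what the two bases are normally combined into:

    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_head   = 12;    // hypothetical head count (not a power of two)
        const float max_bias = 8.0f;  // hypothetical bias as read from op_params

        const int   n_heads_log2_floor = 1 << (int) floorf(log2f((float) n_head));   // = 8
        const float m0 = powf(2.0f, -max_bias          / n_heads_log2_floor);        // = 0.5
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);        // ~= 0.7071

        for (int h = 0; h < n_head; ++h) {
            const float slope = h < n_heads_log2_floor
                ? powf(m0, h + 1)                               // first 2^k heads: m0, m0^2, ...
                : powf(m1, 2*(h - n_heads_log2_floor) + 1);     // remaining heads: odd powers of m1
            printf("head %2d: slope %.6f\n", h, slope);
        }
        return 0;
    }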
@@ -5881,6 +6334,11 @@ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ten
5881
6334
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
5882
6335
  }
5883
6336
 
6337
+ void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6338
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6339
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
6340
+ }
6341
+
5884
6342
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
5885
6343
  (void) src0;
5886
6344
  (void) src1;
@@ -6000,7 +6458,7 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
6000
6458
  return extra;
6001
6459
  }
6002
6460
 
6003
- void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
6461
+ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
6004
6462
  if (scratch && g_scratch_size == 0) {
6005
6463
  return;
6006
6464
  }
@@ -6009,14 +6467,19 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
6009
6467
  if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
6010
6468
  const ggml_op src0_op = tensor->src[0]->op;
6011
6469
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
6012
- ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
6470
+ ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
6013
6471
  }
6014
6472
  }
6015
6473
  if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
6016
- ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
6474
+ ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
6017
6475
  }
6018
6476
 
6019
6477
  tensor->backend = GGML_BACKEND_GPU;
6478
+
6479
+ if (scratch && no_alloc) {
6480
+ return;
6481
+ }
6482
+
6020
6483
  struct ggml_tensor_extra_gpu * extra;
6021
6484
 
6022
6485
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
@@ -6068,16 +6531,48 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
6068
6531
  tensor->extra = extra;
6069
6532
  }
6070
6533
 
6534
+ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
6535
+ if (g_scratch_size == 0) {
6536
+ return;
6537
+ }
6538
+ if (g_scratch_buffer == nullptr) {
6539
+ CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
6540
+ }
6541
+
6542
+ struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
6543
+
6544
+ const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
6545
+ tensor->op == GGML_OP_VIEW;
6546
+
6547
+ if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
6548
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
6549
+ char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
6550
+ size_t view_offset = 0;
6551
+ if (tensor->op == GGML_OP_VIEW) {
6552
+ memcpy(&view_offset, tensor->op_params, sizeof(size_t));
6553
+ }
6554
+ extra->data_device[g_main_device] = src0_ddc + view_offset;
6555
+ } else {
6556
+ extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
6557
+ }
6558
+
6559
+ tensor->extra = extra;
6560
+ }
6561
+
6071
6562
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
6072
- ggml_cuda_assign_buffers_impl(tensor, true, false);
6563
+ ggml_cuda_assign_buffers_impl(tensor, true, false, false);
6564
+ }
6565
+
6566
+ void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
6567
+ ggml_cuda_assign_buffers_impl(tensor, true, false, true);
6073
6568
  }
6074
6569
 
6075
6570
  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
6076
- ggml_cuda_assign_buffers_impl(tensor, false, false);
6571
+ ggml_cuda_assign_buffers_impl(tensor, false, false, false);
6077
6572
  }
6078
6573
 
6079
6574
  void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
6080
- ggml_cuda_assign_buffers_impl(tensor, false, true);
6575
+ ggml_cuda_assign_buffers_impl(tensor, false, true, false);
6081
6576
  }
6082
6577
 
6083
6578
  void ggml_cuda_set_main_device(int main_device) {
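ggml_cuda_assign_buffers_impl gains a no_alloc flag, surfaced as ggml_cuda_assign_buffers_no_alloc: when scratch && no_alloc the tensor is only tagged GGML_BACKEND_GPU and the function returns before any scratch memory is carved out. The new ggml_cuda_assign_scratch_offset then binds the tensor to an externally chosen offset inside the (lazily cudaMalloc'ed) scratch buffer, reusing the parent's device pointer for views. A sketch of how a caller might combine the two, assuming the usual ggml headers declare these entry points and that the scratch size has been configured elsewhere (otherwise both calls return early, as the code above shows):

    #include "ggml.h"
    #include "ggml-cuda.h"

    int main() {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);
        struct ggml_tensor  * t   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);

        // Phase 1 (graph build): mark the tensor GPU-resident without reserving scratch space.
        ggml_cuda_assign_buffers_no_alloc(t);

        // Phase 2 (after an allocator has planned the layout): bind it to a concrete offset.
        ggml_cuda_assign_scratch_offset(t, /*offset =*/ 0);

        ggml_free(ctx);
        return 0;
    }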
@@ -6216,6 +6711,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
6216
6711
  }
6217
6712
  func = ggml_cuda_rope;
6218
6713
  break;
6714
+ case GGML_OP_ALIBI:
6715
+ if (!any_on_device) {
6716
+ return false;
6717
+ }
6718
+ func = ggml_cuda_alibi;
6719
+ break;
6219
6720
  default:
6220
6721
  return false;
6221
6722
  }
@@ -6229,3 +6730,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
6229
6730
  func(tensor->src[0], tensor->src[1], tensor);
6230
6731
  return true;
6231
6732
  }
6733
+
6734
+ int ggml_cuda_get_device_count() {
6735
+ int device_count;
6736
+ CUDA_CHECK(cudaGetDeviceCount(&device_count));
6737
+ return device_count;
6738
+ }
6739
+
6740
+ void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
6741
+ cudaDeviceProp prop;
6742
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
6743
+ snprintf(description, description_size, "%s", prop.name);
6744
+ }
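The two helpers appended at the end let callers enumerate devices and fetch a printable name without reaching into CUDA themselves; they are thin wrappers over cudaGetDeviceCount and cudaGetDeviceProperties. A standalone equivalent, minus the CUDA_CHECK error handling, for reference:

    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        int device_count = 0;
        if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
            fprintf(stderr, "no CUDA devices found\n");
            return 1;
        }
        for (int id = 0; id < device_count; ++id) {
            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, id) == cudaSuccess) {
                // prop.name is what ggml_cuda_get_device_description copies out via snprintf
                printf("device %d: %s (compute %d.%d)\n", id, prop.name, prop.major, prop.minor);
            }
        }
        return 0;
    }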