llama_cpp 0.3.7 → 0.4.0

@@ -6,15 +6,116 @@
  #include <atomic>
  #include <assert.h>

+ #if defined(GGML_USE_HIPBLAS)
+ #include <hip/hip_runtime.h>
+ #include <hipblas/hipblas.h>
+ #include <hip/hip_fp16.h>
+ #ifdef __HIP_PLATFORM_AMD__
+ // for rocblas_initialize()
+ #include "rocblas/rocblas.h"
+ #endif
+ #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+ #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+ #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+ #define CUBLAS_OP_N HIPBLAS_OP_N
+ #define CUBLAS_OP_T HIPBLAS_OP_T
+ #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+ #define CUBLAS_TF32_TENSOR_OP_MATH 0
+ #define CUDA_R_16F HIPBLAS_R_16F
+ #define CUDA_R_32F HIPBLAS_R_32F
+ #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+ #define cublasCreate hipblasCreate
+ #define cublasGemmEx hipblasGemmEx
+ #define cublasHandle_t hipblasHandle_t
+ #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+ #define cublasSetStream hipblasSetStream
+ #define cublasSgemm hipblasSgemm
+ #define cublasStatus_t hipblasStatus_t
+ #define cudaDeviceProp hipDeviceProp_t
+ #define cudaDeviceSynchronize hipDeviceSynchronize
+ #define cudaError_t hipError_t
+ #define cudaEventCreateWithFlags hipEventCreateWithFlags
+ #define cudaEventDisableTiming hipEventDisableTiming
+ #define cudaEventRecord hipEventRecord
+ #define cudaEvent_t hipEvent_t
+ #define cudaEventDestroy hipEventDestroy
+ #define cudaFree hipFree
+ #define cudaFreeHost hipHostFree
+ #define cudaGetDevice hipGetDevice
+ #define cudaGetDeviceCount hipGetDeviceCount
+ #define cudaGetDeviceProperties hipGetDeviceProperties
+ #define cudaGetErrorString hipGetErrorString
+ #define cudaGetLastError hipGetLastError
+ #define cudaMalloc hipMalloc
+ #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+ #define cudaMemcpy hipMemcpy
+ #define cudaMemcpy2DAsync hipMemcpy2DAsync
+ #define cudaMemcpyAsync hipMemcpyAsync
+ #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+ #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+ #define cudaMemcpyKind hipMemcpyKind
+ #define cudaMemset hipMemset
+ #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+ #define cudaSetDevice hipSetDevice
+ #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+ #define cudaStreamNonBlocking hipStreamNonBlocking
+ #define cudaStreamSynchronize hipStreamSynchronize
+ #define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+ #define cudaStream_t hipStream_t
+ #define cudaSuccess hipSuccess
+ #else
  #include <cuda_runtime.h>
  #include <cublas_v2.h>
  #include <cuda_fp16.h>
+ #endif

  #include "ggml-cuda.h"
  #include "ggml.h"

  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+ #ifndef CC_TURING
  #define CC_TURING 700
+ #endif
+
+ #if defined(GGML_USE_HIPBLAS)
+ #define __CUDA_ARCH__ 1300
+
+ typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+ const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+ const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+ const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+ return reinterpret_cast<const int&>(c);
+ }
+
+ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+ #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+ c = __builtin_amdgcn_sdot4(a, b, c, false);
+ #elif defined(__gfx1100__)
+ c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+ #elif defined(__gfx1010__) || defined(__gfx900__)
+ int tmp1;
+ int tmp2;
+ asm("\n \
+ v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+ v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+ v_add3_u32 %0, %1, %2, %0 \n \
+ v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+ v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+ v_add3_u32 %0, %1, %2, %0 \n \
+ "
+ : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+ : "v"(a), "v"(b)
+ );
+ #else
+ const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+ const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+ c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+ #endif
+ return c;
+ }
+ #endif

  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -259,6 +360,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_CPY_BLOCK_SIZE 32
  #define CUDA_SCALE_BLOCK_SIZE 256
  #define CUDA_ROPE_BLOCK_SIZE 256
+ #define CUDA_ALIBI_BLOCK_SIZE 32
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
@@ -286,7 +388,7 @@ static int g_device_count = -1;
  static int g_main_device = 0;
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
- static bool g_mul_mat_q = false;
+ static bool g_mul_mat_q = true;

  static void * g_scratch_buffer = nullptr;
  static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
@@ -423,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
  static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
  const block_q4_1 * x = (const block_q4_1 *) vx;

- const dfloat d = x[ib].dm.x;
- const dfloat m = x[ib].dm.y;
+ const dfloat d = __low2half(x[ib].dm);
+ const dfloat m = __high2half(x[ib].dm);

  const int vui = x[ib].qs[iqs];

@@ -466,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
  static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
  const block_q5_1 * x = (const block_q5_1 *) vx;

- const dfloat d = x[ib].dm.x;
- const dfloat m = x[ib].dm.y;
+ const dfloat d = __low2half(x[ib].dm);
+ const dfloat m = __high2half(x[ib].dm);

  uint32_t qh;
  memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -519,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
  const uint8_t q = x[i].qs[32*n + l];
  float * y = yy + i*QK_K + 128*n;

- float dall = x[i].dm.x;
- float dmin = x[i].dm.y;
+ float dall = __low2half(x[i].dm);
+ float dmin = __high2half(x[i].dm);
  y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
  y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -530,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
  const int il = tid%16; // 0...15
  const uint8_t q = x[i].qs[il] >> (2*is);
  float * y = yy + i*QK_K + 16*is + il;
- float dall = x[i].dm.x;
- float dmin = x[i].dm.y;
+ float dall = __low2half(x[i].dm);
+ float dmin = __high2half(x[i].dm);
  y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
  y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
  #endif
@@ -617,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float

  float * y = yy + i*QK_K + 64*il + n*ir;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint8_t * q = x[i].qs + 32*il + n*ir;

@@ -656,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float

  float * y = yy + i*QK_K + 64*il + 2*ir;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint8_t * ql = x[i].qs + 32*il + 2*ir;
  const uint8_t * qh = x[i].qh + 2*ir;
@@ -769,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
  const float * y = yy + i * QK_K + y_offset;
  const uint8_t * q = x[i].qs + q_offset;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
  aux[0] = a[0] & 0x0f0f0f0f;
@@ -990,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
  const float * y1 = yy + i*QK_K + y_offset;
  const float * y2 = y1 + 128;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint16_t * a = (const uint16_t *)x[i].scales;
  aux[0] = a[im+0] & kmask1;
@@ -1123,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
  const float * y1 = yy + i*QK_K + y_offset;
  const float * y2 = y1 + 128;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint16_t * a = (const uint16_t *)x[i].scales;
  aux[0] = a[im+0] & kmask1;
@@ -1347,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
  return;
  }

- y[ib].ds.x = d;
- y[ib].ds.y = sum;
+ reinterpret_cast<half&>(y[ib].ds.x) = d;
+ reinterpret_cast<half&>(y[ib].ds.y) = sum;
  }

  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -1399,6 +1501,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
  // second part effectively subtracts 8 from each quant value
  return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1436,6 +1539,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
  // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
  return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1471,6 +1575,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
  // second part effectively subtracts 16 from each quant value
  return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1516,6 +1621,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
  return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1537,6 +1643,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp

  return d8_0*d8_1 * sumi;
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1567,6 +1674,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
  // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
  return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1602,6 +1710,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(

  return dm2f.x*sumf_d - dm2f.y*sumf_m;
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1639,6 +1748,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(

  return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1679,6 +1789,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(

  return d3 * sumf;
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1704,6 +1815,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(

  return d3*d8 * sumi;
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1737,12 +1849,12 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
  return dm4f.x*sumf_d - dm4f.y*sumf_m;

  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

  // contiguous u/y values
- // also used for q5_K
  static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
  const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
  const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
@@ -1752,19 +1864,18 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
  float sumf_m = 0.0f;

  #pragma unroll
- for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) {
+ for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
  int sumi_d = 0;

  #pragma unroll
- for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) {
- sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product
- sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
+ for (int j = 0; j < QI8_1; ++j) {
+ sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
  }

- const float2 ds8f = __half22float2(ds8[i0 / 4]);
+ const float2 ds8f = __half22float2(ds8[i]);

- sumf_d += ds8f.x * (sc[i0/4] * sumi_d);
- sumf_m += ds8f.y * m[i0/4]; // sum of q8_1 block * q4_K min val
+ sumf_d += ds8f.x * (sc[i] * sumi_d);
+ sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
  }

  const float2 dm4f = __half22float2(dm4);
@@ -1772,6 +1883,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
  return dm4f.x*sumf_d - dm4f.y*sumf_m;

  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1780,7 +1892,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
  #define VDR_Q5_K_Q8_1_MMQ 8

  // contiguous v/x values
- static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
  const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
  const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {

@@ -1812,6 +1924,41 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
  return dm5f.x*sumf_d - dm5f.y*sumf_m;

  #else
+ assert(false);
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ // contiguous u/y values
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+ const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ float sumf_d = 0.0f;
+ float sumf_m = 0.0f;
+
+ #pragma unroll
+ for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+ int sumi_d = 0;
+
+ #pragma unroll
+ for (int j = 0; j < QI8_1; ++j) {
+ sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+ }
+
+ const float2 ds8f = __half22float2(ds8[i]);
+
+ sumf_d += ds8f.x * (sc[i] * sumi_d);
+ sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
+ }
+
+ const float2 dm4f = __half22float2(dm4);
+
+ return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+ #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1842,6 +1989,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(

  return d*sumf;
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1873,6 +2021,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
  return d6 * sumf_d;

  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -2298,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
  u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
  }

- return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2384,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
  #pragma unroll
  for (int i = 0; i < QR2_K; ++ i) {
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
- d8[i] = bq8_1[bq8_offset + i].ds.x;
+ d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
  }

  return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -2503,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
  #pragma unroll
  for (int i = 0; i < QR3_K; ++i) {
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
- d8[i] = bq8_1[bq8_offset + i].ds.x;
+ d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
  }

  return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -2672,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(

  for (int i = 0; i < QR4_K; ++i) {
  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
- d8[i] = bq8i->ds.x;
+ d8[i] = __low2half(bq8i->ds);

  const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
  u[2*i+0] = q8[0];
@@ -2699,8 +2848,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  const float dall = bq4_K->d[0];
  const float dmin = bq4_K->d[1];

- const float d8_1 = bq8_1[0].ds.x;
- const float d8_2 = bq8_1[1].ds.x;
+ const float d8_1 = __low2float(bq8_1[0].ds);
+ const float d8_2 = __low2float(bq8_1[1].ds);

  const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
  const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2722,6 +2871,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  return dall * sumf_d - dmin * sumf_m;

  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

@@ -2808,18 +2958,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

- int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
-
- #pragma unroll
- for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
- v[l + 0] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
- v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
- }
-
  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

  const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
- return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
+ return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
+ x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
  }

  static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
@@ -2859,14 +3002,14 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  #pragma unroll
  for (int i = 0; i < QR5_K; ++i) {
  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
- d8[i] = bq8i->ds.x;
+ d8[i] = __low2float(bq8i->ds);

  const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
  u[2*i+0] = q8[0];
  u[2*i+1] = q8[4];
  }

- return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
+ return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);

  #else

@@ -2877,8 +3020,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(

  const float d = bq5_K->d;

- const float d8_1 = bq8_1[0].ds.x;
- const float d8_2 = bq8_1[1].ds.x;
+ const float d8_1 = __low2half(bq8_1[0].ds);
+ const float d8_2 = __low2half(bq8_1[1].ds);

  const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
  const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2905,6 +3048,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  return d * sumf_d;

  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

@@ -3008,7 +3152,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(

  const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
  const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
- return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
+ return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
+ x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
  }

  static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -3031,7 +3176,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
  #pragma unroll
  for (int i = 0; i < QR6_K; ++i) {
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
- d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
+ d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
  }

  return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -3135,7 +3280,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
3135
3280
 
3136
3281
  template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
3137
3282
  allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
3138
- static __global__ void mul_mat_q(
3283
+ static __device__ __forceinline__ void mul_mat_q(
3139
3284
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3140
3285
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3141
3286
 
@@ -3150,7 +3295,6 @@ static __global__ void mul_mat_q(
3150
3295
 
3151
3296
  const int row_dst_0 = blockIdx.x*mmq_y;
3152
3297
  const int & row_x_0 = row_dst_0;
3153
- const int row_dst = row_dst_0 + threadIdx.x;
3154
3298
 
3155
3299
  const int col_dst_0 = blockIdx.y*mmq_x;
3156
3300
  const int & col_y_0 = col_dst_0;
@@ -3200,7 +3344,7 @@ static __global__ void mul_mat_q(
3200
3344
  *dsi_dst = *dsi_src;
3201
3345
  } else {
3202
3346
  float * dfi_dst = (float *) dsi_dst;
3203
- *dfi_dst = (*dsi_src).x;
3347
+ *dfi_dst = __low2half(*dsi_src);
3204
3348
  }
3205
3349
  }
3206
3350
 
@@ -3223,11 +3367,7 @@ static __global__ void mul_mat_q(
3223
3367
  }
3224
3368
  }
3225
3369
 
3226
-
3227
- if (row_dst >= nrows_dst) {
3228
- return;
3229
- }
3230
-
3370
+ #pragma unroll
3231
3371
  for (int j = 0; j < mmq_x; j += nwarps) {
3232
3372
  const int col_dst = col_dst_0 + j + threadIdx.y;
3233
3373
 
@@ -3235,12 +3375,375 @@ static __global__ void mul_mat_q(
3235
3375
  return;
3236
3376
  }
3237
3377
 
3378
+ #pragma unroll
3238
3379
  for (int i = 0; i < mmq_y; i += WARP_SIZE) {
3239
- dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/nwarps];
3380
+ const int row_dst = row_dst_0 + threadIdx.x + i;
3381
+
3382
+ if (row_dst >= nrows_dst) {
3383
+ continue;
3384
+ }
3385
+
3386
+ dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
3240
3387
  }
3241
3388
  }
3242
3389
  }
3243
3390
 
3391
+ #define MMQ_X_Q4_0_AMPERE 64
3392
+ #define MMQ_Y_Q4_0_AMPERE 128
3393
+ #define NWARPS_Q4_0_AMPERE 4
3394
+ #define MMQ_X_Q4_0_PASCAL 64
3395
+ #define MMQ_Y_Q4_0_PASCAL 64
3396
+ #define NWARPS_Q4_0_PASCAL 8
3397
+
3398
+ template <bool need_check> static __global__ void mul_mat_q4_0(
3399
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3400
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3401
+
3402
+ #if __CUDA_ARCH__ >= CC_TURING
3403
+ const int mmq_x = MMQ_X_Q4_0_AMPERE;
3404
+ const int mmq_y = MMQ_Y_Q4_0_AMPERE;
3405
+ const int nwarps = NWARPS_Q4_0_AMPERE;
3406
+
3407
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3408
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3409
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3410
+
3411
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3412
+ const int mmq_x = MMQ_X_Q4_0_PASCAL;
3413
+ const int mmq_y = MMQ_Y_Q4_0_PASCAL;
3414
+ const int nwarps = NWARPS_Q4_0_PASCAL;
3415
+
3416
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3417
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3418
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3419
+ #else
3420
+ (void) vec_dot_q4_0_q8_1_mul_mat;
3421
+ assert(false);
3422
+ #endif // __CUDA_ARCH__ >= CC_TURING
3423
+ }
3424
+
3425
+ #define MMQ_X_Q4_1_AMPERE 64
3426
+ #define MMQ_Y_Q4_1_AMPERE 128
3427
+ #define NWARPS_Q4_1_AMPERE 4
3428
+ #define MMQ_X_Q4_1_PASCAL 64
3429
+ #define MMQ_Y_Q4_1_PASCAL 64
3430
+ #define NWARPS_Q4_1_PASCAL 8
3431
+
3432
+ template <bool need_check> static __global__ void
3433
+ #if __CUDA_ARCH__ < CC_TURING
3434
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
3435
+ #endif // __CUDA_ARCH__ < CC_TURING
3436
+ mul_mat_q4_1(
3437
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3438
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3439
+
3440
+ #if __CUDA_ARCH__ >= CC_TURING
3441
+ const int mmq_x = MMQ_X_Q4_1_AMPERE;
3442
+ const int mmq_y = MMQ_Y_Q4_1_AMPERE;
3443
+ const int nwarps = NWARPS_Q4_1_AMPERE;
3444
+
3445
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
3446
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3447
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3448
+
3449
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3450
+ const int mmq_x = MMQ_X_Q4_1_PASCAL;
3451
+ const int mmq_y = MMQ_Y_Q4_1_PASCAL;
3452
+ const int nwarps = NWARPS_Q4_1_PASCAL;
3453
+
3454
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
3455
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3456
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3457
+ #else
3458
+ (void) vec_dot_q4_1_q8_1_mul_mat;
3459
+ assert(false);
3460
+ #endif // __CUDA_ARCH__ >= CC_TURING
3461
+ }
3462
+
3463
+ #define MMQ_X_Q5_0_AMPERE 128
3464
+ #define MMQ_Y_Q5_0_AMPERE 64
3465
+ #define NWARPS_Q5_0_AMPERE 4
3466
+ #define MMQ_X_Q5_0_PASCAL 64
3467
+ #define MMQ_Y_Q5_0_PASCAL 64
3468
+ #define NWARPS_Q5_0_PASCAL 8
3469
+
3470
+ template <bool need_check> static __global__ void mul_mat_q5_0(
3471
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3472
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3473
+
3474
+ #if __CUDA_ARCH__ >= CC_TURING
3475
+ const int mmq_x = MMQ_X_Q5_0_AMPERE;
3476
+ const int mmq_y = MMQ_Y_Q5_0_AMPERE;
3477
+ const int nwarps = NWARPS_Q5_0_AMPERE;
3478
+
3479
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
3480
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3481
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3482
+
3483
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3484
+ const int mmq_x = MMQ_X_Q5_0_PASCAL;
3485
+ const int mmq_y = MMQ_Y_Q5_0_PASCAL;
3486
+ const int nwarps = NWARPS_Q5_0_PASCAL;
3487
+
3488
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
3489
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3490
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3491
+ #else
3492
+ (void) vec_dot_q5_0_q8_1_mul_mat;
3493
+ assert(false);
3494
+ #endif // __CUDA_ARCH__ >= CC_TURING
3495
+ }
3496
+
3497
+ #define MMQ_X_Q5_1_AMPERE 128
3498
+ #define MMQ_Y_Q5_1_AMPERE 64
3499
+ #define NWARPS_Q5_1_AMPERE 4
3500
+ #define MMQ_X_Q5_1_PASCAL 64
3501
+ #define MMQ_Y_Q5_1_PASCAL 64
3502
+ #define NWARPS_Q5_1_PASCAL 8
3503
+
3504
+ template <bool need_check> static __global__ void mul_mat_q5_1(
3505
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3506
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3507
+
3508
+ #if __CUDA_ARCH__ >= CC_TURING
3509
+ const int mmq_x = MMQ_X_Q5_1_AMPERE;
3510
+ const int mmq_y = MMQ_Y_Q5_1_AMPERE;
3511
+ const int nwarps = NWARPS_Q5_1_AMPERE;
3512
+
3513
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
3514
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3515
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3516
+
3517
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3518
+ const int mmq_x = MMQ_X_Q5_1_PASCAL;
3519
+ const int mmq_y = MMQ_Y_Q5_1_PASCAL;
3520
+ const int nwarps = NWARPS_Q5_1_PASCAL;
3521
+
3522
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
3523
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3524
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3525
+ #else
3526
+ (void) vec_dot_q5_1_q8_1_mul_mat;
3527
+ assert(false);
3528
+ #endif // __CUDA_ARCH__ >= CC_TURING
3529
+ }
3530
+
3531
+ #define MMQ_X_Q8_0_AMPERE 128
3532
+ #define MMQ_Y_Q8_0_AMPERE 64
3533
+ #define NWARPS_Q8_0_AMPERE 4
3534
+ #define MMQ_X_Q8_0_PASCAL 64
3535
+ #define MMQ_Y_Q8_0_PASCAL 64
3536
+ #define NWARPS_Q8_0_PASCAL 8
3537
+
3538
+ template <bool need_check> static __global__ void mul_mat_q8_0(
3539
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3540
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3541
+
3542
+ #if __CUDA_ARCH__ >= CC_TURING
3543
+ const int mmq_x = MMQ_X_Q8_0_AMPERE;
3544
+ const int mmq_y = MMQ_Y_Q8_0_AMPERE;
3545
+ const int nwarps = NWARPS_Q8_0_AMPERE;
3546
+
3547
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
3548
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3549
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3550
+
3551
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3552
+ const int mmq_x = MMQ_X_Q8_0_PASCAL;
3553
+ const int mmq_y = MMQ_Y_Q8_0_PASCAL;
3554
+ const int nwarps = NWARPS_Q8_0_PASCAL;
3555
+
3556
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
3557
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3558
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3559
+ #else
3560
+ (void) vec_dot_q8_0_q8_1_mul_mat;
3561
+ assert(false);
3562
+ #endif // __CUDA_ARCH__ >= CC_TURING
3563
+ }
3564
+
3565
+ #define MMQ_X_Q2_K_AMPERE 64
3566
+ #define MMQ_Y_Q2_K_AMPERE 128
3567
+ #define NWARPS_Q2_K_AMPERE 4
3568
+ #define MMQ_X_Q2_K_PASCAL 64
3569
+ #define MMQ_Y_Q2_K_PASCAL 64
3570
+ #define NWARPS_Q2_K_PASCAL 8
3571
+
3572
+ template <bool need_check> static __global__ void mul_mat_q2_K(
3573
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3574
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3575
+
3576
+ #if __CUDA_ARCH__ >= CC_TURING
3577
+ const int mmq_x = MMQ_X_Q2_K_AMPERE;
3578
+ const int mmq_y = MMQ_Y_Q2_K_AMPERE;
3579
+ const int nwarps = NWARPS_Q2_K_AMPERE;
3580
+
3581
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
3582
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3583
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3584
+
3585
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3586
+ const int mmq_x = MMQ_X_Q2_K_PASCAL;
3587
+ const int mmq_y = MMQ_Y_Q2_K_PASCAL;
3588
+ const int nwarps = NWARPS_Q2_K_PASCAL;
3589
+
3590
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
3591
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3592
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3593
+ #else
3594
+ (void) vec_dot_q2_K_q8_1_mul_mat;
3595
+ assert(false);
3596
+ #endif // __CUDA_ARCH__ >= CC_TURING
3597
+ }
3598
+
3599
+ #define MMQ_X_Q3_K_AMPERE 128
3600
+ #define MMQ_Y_Q3_K_AMPERE 128
3601
+ #define NWARPS_Q3_K_AMPERE 4
3602
+ #define MMQ_X_Q3_K_PASCAL 64
3603
+ #define MMQ_Y_Q3_K_PASCAL 64
3604
+ #define NWARPS_Q3_K_PASCAL 8
3605
+
3606
+ template <bool need_check> static __global__ void
3607
+ #if __CUDA_ARCH__ < CC_TURING
3608
+ __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
3609
+ #endif // __CUDA_ARCH__ < CC_TURING
3610
+ mul_mat_q3_K(
3611
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3612
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3613
+
3614
+ #if __CUDA_ARCH__ >= CC_TURING
3615
+ const int mmq_x = MMQ_X_Q3_K_AMPERE;
3616
+ const int mmq_y = MMQ_Y_Q3_K_AMPERE;
3617
+ const int nwarps = NWARPS_Q3_K_AMPERE;
3618
+
3619
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
3620
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3621
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3622
+
3623
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3624
+ const int mmq_x = MMQ_X_Q3_K_PASCAL;
3625
+ const int mmq_y = MMQ_Y_Q3_K_PASCAL;
3626
+ const int nwarps = NWARPS_Q3_K_PASCAL;
3627
+
3628
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
3629
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3630
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3631
+ #else
3632
+ (void) vec_dot_q3_K_q8_1_mul_mat;
3633
+ assert(false);
3634
+ #endif // __CUDA_ARCH__ >= CC_TURING
3635
+ }
3636
+
3637
+ #define MMQ_X_Q4_K_AMPERE 64
3638
+ #define MMQ_Y_Q4_K_AMPERE 128
3639
+ #define NWARPS_Q4_K_AMPERE 4
3640
+ #define MMQ_X_Q4_K_PASCAL 64
3641
+ #define MMQ_Y_Q4_K_PASCAL 64
3642
+ #define NWARPS_Q4_K_PASCAL 8
3643
+
3644
+ template <bool need_check> static __global__ void
3645
+ #if __CUDA_ARCH__ < CC_TURING
3646
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
3647
+ #endif // __CUDA_ARCH__ < CC_TURING
3648
+ mul_mat_q4_K(
3649
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3650
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3651
+
3652
+ #if __CUDA_ARCH__ >= CC_TURING
3653
+ const int mmq_x = MMQ_X_Q4_K_AMPERE;
3654
+ const int mmq_y = MMQ_Y_Q4_K_AMPERE;
3655
+ const int nwarps = NWARPS_Q4_K_AMPERE;
3656
+
3657
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
3658
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3659
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3660
+
3661
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3662
+ const int mmq_x = MMQ_X_Q4_K_PASCAL;
3663
+ const int mmq_y = MMQ_Y_Q4_K_PASCAL;
3664
+ const int nwarps = NWARPS_Q4_K_PASCAL;
3665
+
3666
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
3667
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3668
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3669
+ #else
3670
+ (void) vec_dot_q4_K_q8_1_mul_mat;
3671
+ assert(false);
3672
+ #endif // __CUDA_ARCH__ >= CC_TURING
3673
+ }
3674
+
3675
+ #define MMQ_X_Q5_K_AMPERE 64
3676
+ #define MMQ_Y_Q5_K_AMPERE 128
3677
+ #define NWARPS_Q5_K_AMPERE 4
3678
+ #define MMQ_X_Q5_K_PASCAL 64
3679
+ #define MMQ_Y_Q5_K_PASCAL 64
3680
+ #define NWARPS_Q5_K_PASCAL 8
3681
+
3682
+ template <bool need_check> static __global__ void mul_mat_q5_K(
3683
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3684
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3685
+
3686
+ #if __CUDA_ARCH__ >= CC_TURING
3687
+ const int mmq_x = MMQ_X_Q5_K_AMPERE;
3688
+ const int mmq_y = MMQ_Y_Q5_K_AMPERE;
3689
+ const int nwarps = NWARPS_Q5_K_AMPERE;
3690
+
3691
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
3692
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
3693
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3694
+
3695
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3696
+ const int mmq_x = MMQ_X_Q5_K_PASCAL;
3697
+ const int mmq_y = MMQ_Y_Q5_K_PASCAL;
3698
+ const int nwarps = NWARPS_Q5_K_PASCAL;
3699
+
3700
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
3701
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
3702
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3703
+ #else
3704
+ (void) vec_dot_q5_K_q8_1_mul_mat;
3705
+ assert(false);
3706
+ #endif // __CUDA_ARCH__ >= CC_TURING
3707
+ }
3708
+
3709
+ #define MMQ_X_Q6_K_AMPERE 64
3710
+ #define MMQ_Y_Q6_K_AMPERE 64
3711
+ #define NWARPS_Q6_K_AMPERE 4
3712
+ #define MMQ_X_Q6_K_PASCAL 64
3713
+ #define MMQ_Y_Q6_K_PASCAL 64
3714
+ #define NWARPS_Q6_K_PASCAL 8
3715
+
3716
+ template <bool need_check> static __global__ void
3717
+ #if __CUDA_ARCH__ < CC_TURING
3718
+ __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
3719
+ #endif // __CUDA_ARCH__ < CC_TURING
3720
+ mul_mat_q6_K(
3721
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3722
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3723
+
3724
+ #if __CUDA_ARCH__ >= CC_TURING
3725
+ const int mmq_x = MMQ_X_Q6_K_AMPERE;
3726
+ const int mmq_y = MMQ_Y_Q6_K_AMPERE;
3727
+ const int nwarps = NWARPS_Q6_K_AMPERE;
3728
+
3729
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
3730
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
3731
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3732
+
3733
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
3734
+ const int mmq_x = MMQ_X_Q6_K_PASCAL;
3735
+ const int mmq_y = MMQ_Y_Q6_K_PASCAL;
3736
+ const int nwarps = NWARPS_Q6_K_PASCAL;
3737
+
3738
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
3739
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
3740
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3741
+ #else
3742
+ (void) vec_dot_q6_K_q8_1_mul_mat;
3743
+ assert(false);
3744
+ #endif // __CUDA_ARCH__ >= CC_TURING
3745
+ }
3746
+
3244
3747
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
3245
3748
  static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
3246
3749
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -3485,13 +3988,13 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
3485
3988
  // rope == RoPE == rotary positional embedding
3486
3989
  static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
3487
3990
  const float p_delta, const int p_delta_rows, const float theta_scale) {
3488
- const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
3991
+ const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
3489
3992
 
3490
3993
  if (col >= ncols) {
3491
3994
  return;
3492
3995
  }
3493
3996
 
3494
- const int row = blockDim.y*blockIdx.y + threadIdx.y;
3997
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
3495
3998
  const int i = row*ncols + col;
3496
3999
 
3497
4000
  const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
@@ -3505,6 +4008,28 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
3505
4008
  dst[i + 1] = x0*sin_theta + x1*cos_theta;
3506
4009
  }
3507
4010
 
4011
+ static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
4012
+ const float p_delta, const int p_delta_rows, const float theta_scale) {
4013
+ const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
4014
+
4015
+ if (col >= ncols) {
4016
+ return;
4017
+ }
4018
+
4019
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
4020
+ const int i = row*ncols + col/2;
4021
+
4022
+ const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
4023
+ const float sin_theta = sinf(theta);
4024
+ const float cos_theta = cosf(theta);
4025
+
4026
+ const float x0 = x[i + 0];
4027
+ const float x1 = x[i + ncols/2];
4028
+
4029
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
4030
+ dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
4031
+ }
4032
+
3508
4033
  static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
3509
4034
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
3510
4035
  const int half_n_dims = ncols/4;
@@ -3539,9 +4064,32 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
3539
4064
  dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
3540
4065
  }
3541
4066
 
3542
- static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
4067
+ static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
4068
+ const int n_heads_log2_floor, const float m0, const float m1) {
3543
4069
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
4070
+
4071
+ if (col >= ncols) {
4072
+ return;
4073
+ }
4074
+
3544
4075
  const int row = blockDim.y*blockIdx.y + threadIdx.y;
4076
+ const int i = row*ncols + col;
4077
+
4078
+ const int k = row/k_rows;
4079
+
4080
+ float m_k;
4081
+ if (k < n_heads_log2_floor) {
4082
+ m_k = powf(m0, k + 1);
4083
+ } else {
4084
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
4085
+ }
4086
+
4087
+ dst[i] = col * m_k + x[i];
4088
+ }
4089
+
4090
+ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
4091
+ const int col = blockDim.y*blockIdx.y + threadIdx.y;
4092
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
3545
4093
 
3546
4094
  if (col >= ncols) {
3547
4095
  return;
@@ -3554,24 +4102,29 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
3554
4102
 
3555
4103
  // the CUDA soft max implementation differs from the CPU implementation
3556
4104
  // instead of doubles floats are used
3557
- // values are also not normalized to the maximum value by subtracting it in the exponential function
3558
- // theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
3559
4105
  static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
3560
- const int row = blockDim.y*blockIdx.y + threadIdx.y;
3561
- const int block_size = blockDim.x;
3562
- const int tid = threadIdx.x;
4106
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
4107
+ const int block_size = blockDim.y;
4108
+ const int tid = threadIdx.y;
3563
4109
 
3564
- float tmp = 0.0;
4110
+ float max_val = -INFINITY;
3565
4111
 
3566
- for (int block_start = 0; block_start < ncols; block_start += block_size) {
3567
- const int col = block_start + tid;
4112
+ for (int col = tid; col < ncols; col += block_size) {
4113
+ const int i = row*ncols + col;
4114
+ max_val = max(max_val, x[i]);
4115
+ }
3568
4116
 
3569
- if (col >= ncols) {
3570
- break;
3571
- }
4117
+ // find the max value in the block
4118
+ #pragma unroll
4119
+ for (int mask = 16; mask > 0; mask >>= 1) {
4120
+ max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
4121
+ }
3572
4122
 
4123
+ float tmp = 0.f;
4124
+
4125
+ for (int col = tid; col < ncols; col += block_size) {
3573
4126
  const int i = row*ncols + col;
3574
- const float val = expf(x[i]);
4127
+ const float val = expf(x[i] - max_val);
3575
4128
  tmp += val;
3576
4129
  dst[i] = val;
3577
4130
  }
@@ -3582,15 +4135,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
3582
4135
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
3583
4136
  }
3584
4137
 
3585
- for (int block_start = 0; block_start < ncols; block_start += block_size) {
3586
- const int col = block_start + tid;
3587
-
3588
- if (col >= ncols) {
3589
- break;
3590
- }
4138
+ const float inv_tmp = 1.f / tmp;
3591
4139
 
4140
+ for (int col = tid; col < ncols; col += block_size) {
3592
4141
  const int i = row*ncols + col;
3593
- dst[i] /= tmp;
4142
+ dst[i] *= inv_tmp;
3594
4143
  }
3595
4144
  }
3596
4145
 
@@ -3942,48 +4491,32 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
3942
4491
  CUDA_CHECK(cudaGetDevice(&id));
3943
4492
  const int compute_capability = g_compute_capabilities[id];
3944
4493
 
4494
+ int mmq_x, mmq_y, nwarps;
3945
4495
  if (compute_capability >= CC_TURING) {
3946
- const int mmq_x = 64;
3947
- const int mmq_y = 128;
3948
- const int nwarps = 4;
3949
-
3950
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
3951
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3952
- const dim3 block_nums(block_num_x, block_num_y, 1);
3953
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
3954
-
3955
- if (nrows_x % mmq_y == 0) {
3956
- const bool need_check = false;
3957
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3958
- load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3959
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3960
- } else {
3961
- const bool need_check = true;
3962
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3963
- load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3964
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3965
- }
4496
+ mmq_x = MMQ_X_Q4_0_AMPERE;
4497
+ mmq_y = MMQ_Y_Q4_0_AMPERE;
4498
+ nwarps = NWARPS_Q4_0_AMPERE;
4499
+ } else if (compute_capability >= MIN_CC_DP4A) {
4500
+ mmq_x = MMQ_X_Q4_0_PASCAL;
4501
+ mmq_y = MMQ_Y_Q4_0_PASCAL;
4502
+ nwarps = NWARPS_Q4_0_PASCAL;
3966
4503
  } else {
3967
- const int mmq_x = 64;
3968
- const int mmq_y = 64;
3969
- const int nwarps = 4;
3970
-
3971
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
3972
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
3973
- const dim3 block_nums(block_num_x, block_num_y, 1);
3974
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
3975
-
3976
- if (nrows_x % mmq_y == 0) {
3977
- const bool need_check = false;
3978
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3979
- load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3980
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3981
- } else {
3982
- const bool need_check = true;
3983
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3984
- load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3985
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3986
- }
4504
+ GGML_ASSERT(false);
4505
+ }
4506
+
4507
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4508
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4509
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4510
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4511
+
4512
+ if (nrows_x % mmq_y == 0) {
4513
+ const bool need_check = false;
4514
+ mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
4515
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4516
+ } else {
4517
+ const bool need_check = true;
4518
+ mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
4519
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3987
4520
  }
3988
4521
  }
3989
4522
 
@@ -3995,49 +4528,32 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
3995
4528
  CUDA_CHECK(cudaGetDevice(&id));
3996
4529
  const int compute_capability = g_compute_capabilities[id];
3997
4530
 
4531
+ int mmq_x, mmq_y, nwarps;
3998
4532
  if (compute_capability >= CC_TURING) {
3999
- const int mmq_x = 64;
4000
- const int mmq_y = 128;
4001
- const int nwarps = 4;
4002
-
4003
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4004
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4005
- const dim3 block_nums(block_num_x, block_num_y, 1);
4006
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4007
-
4008
- if (nrows_x % mmq_y == 0) {
4009
- const bool need_check = false;
4010
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4011
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4012
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4013
- } else {
4014
- const bool need_check = true;
4015
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4016
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4017
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4018
- }
4533
+ mmq_x = MMQ_X_Q4_1_AMPERE;
4534
+ mmq_y = MMQ_Y_Q4_1_AMPERE;
4535
+ nwarps = NWARPS_Q4_1_AMPERE;
4536
+ } else if (compute_capability >= MIN_CC_DP4A) {
4537
+ mmq_x = MMQ_X_Q4_1_PASCAL;
4538
+ mmq_y = MMQ_Y_Q4_1_PASCAL;
4539
+ nwarps = NWARPS_Q4_1_PASCAL;
4019
4540
  } else {
4020
- const int mmq_x = 64;
4021
- const int mmq_y = 64;
4022
- const int nwarps = 8;
4023
-
4024
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4025
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4026
- const dim3 block_nums(block_num_x, block_num_y, 1);
4027
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4028
-
4029
- if (nrows_x % mmq_y == 0) {
4030
- const bool need_check = false;
4031
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4032
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4033
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4034
- } else {
4035
- const bool need_check = true;
4036
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4037
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4038
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4039
- }
4541
+ GGML_ASSERT(false);
4542
+ }
4040
4543
 
4544
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4545
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4546
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4547
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4548
+
4549
+ if (nrows_x % mmq_y == 0) {
4550
+ const bool need_check = false;
4551
+ mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
4552
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4553
+ } else {
4554
+ const bool need_check = true;
4555
+ mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
4556
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4041
4557
  }
4042
4558
  }
4043
4559
 
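The refactor collapses the old per-architecture constant blocks into named tile constants (the MMQ_X_*/MMQ_Y_*/NWARPS_* values selected above) plus one shared grid computation, which is plain ceiling division over the tile shape. A small sketch of that selection-and-sizing scaffolding, with placeholder tile values standing in for the real constants:

#include <cassert>
#include <cuda_runtime.h>

// Placeholder tile shapes; the real values are the per-quant MMQ_X_*/MMQ_Y_*/NWARPS_* constants.
#define TILE_X_AMPERE 64
#define TILE_Y_AMPERE 128
#define NWARPS_AMPERE 4
#define TILE_X_PASCAL 64
#define TILE_Y_PASCAL 64
#define NWARPS_PASCAL 8

static void pick_tile(const int compute_capability, int & mmq_x, int & mmq_y, int & nwarps) {
    if (compute_capability >= 700) {            // CC_TURING and newer
        mmq_x = TILE_X_AMPERE; mmq_y = TILE_Y_AMPERE; nwarps = NWARPS_AMPERE;
    } else if (compute_capability >= 610) {     // MIN_CC_DP4A
        mmq_x = TILE_X_PASCAL; mmq_y = TILE_Y_PASCAL; nwarps = NWARPS_PASCAL;
    } else {
        assert(false && "mul_mat_q needs __dp4a support");
        mmq_x = mmq_y = nwarps = 0;
    }
}

static dim3 grid_for(const int nrows_x, const int ncols_y, const int mmq_x, const int mmq_y) {
    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;  // tiles along the rows of x
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;  // tiles along the columns of y
    return dim3(block_num_x, block_num_y, 1);
}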
@@ -4049,48 +4565,32 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
4049
4565
  CUDA_CHECK(cudaGetDevice(&id));
4050
4566
  const int compute_capability = g_compute_capabilities[id];
4051
4567
 
4568
+ int mmq_x, mmq_y, nwarps;
4052
4569
  if (compute_capability >= CC_TURING) {
4053
- const int mmq_x = 128;
4054
- const int mmq_y = 64;
4055
- const int nwarps = 4;
4056
-
4057
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4058
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4059
- const dim3 block_nums(block_num_x, block_num_y, 1);
4060
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4061
-
4062
- if (nrows_x % mmq_y == 0) {
4063
- const bool need_check = false;
4064
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4065
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4066
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4067
- } else {
4068
- const bool need_check = true;
4069
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4070
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4071
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4072
- }
4570
+ mmq_x = MMQ_X_Q5_0_AMPERE;
4571
+ mmq_y = MMQ_Y_Q5_0_AMPERE;
4572
+ nwarps = NWARPS_Q5_0_AMPERE;
4573
+ } else if (compute_capability >= MIN_CC_DP4A) {
4574
+ mmq_x = MMQ_X_Q5_0_PASCAL;
4575
+ mmq_y = MMQ_Y_Q5_0_PASCAL;
4576
+ nwarps = NWARPS_Q5_0_PASCAL;
4073
4577
  } else {
4074
- const int mmq_x = 64;
4075
- const int mmq_y = 64;
4076
- const int nwarps = 8;
4077
-
4078
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4079
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4080
- const dim3 block_nums(block_num_x, block_num_y, 1);
4081
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4082
-
4083
- if (nrows_x % mmq_y == 0) {
4084
- const bool need_check = false;
4085
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4086
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4087
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4088
- } else {
4089
- const bool need_check = true;
4090
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4091
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4092
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4093
- }
4578
+ GGML_ASSERT(false);
4579
+ }
4580
+
4581
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4582
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4583
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4584
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4585
+
4586
+ if (nrows_x % mmq_y == 0) {
4587
+ const bool need_check = false;
4588
+ mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
4589
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4590
+ } else {
4591
+ const bool need_check = true;
4592
+ mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
4593
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4094
4594
  }
4095
4595
  }
4096
4596
 
@@ -4102,48 +4602,32 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
4102
4602
  CUDA_CHECK(cudaGetDevice(&id));
4103
4603
  const int compute_capability = g_compute_capabilities[id];
4104
4604
 
4605
+ int mmq_x, mmq_y, nwarps;
4105
4606
  if (compute_capability >= CC_TURING) {
4106
- const int mmq_x = 128;
4107
- const int mmq_y = 64;
4108
- const int nwarps = 8;
4109
-
4110
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4111
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4112
- const dim3 block_nums(block_num_x, block_num_y, 1);
4113
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4114
-
4115
- if (nrows_x % mmq_y == 0) {
4116
- const bool need_check = false;
4117
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4118
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4119
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4120
- } else {
4121
- const bool need_check = true;
4122
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4123
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4124
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4125
- }
4607
+ mmq_x = MMQ_X_Q5_1_AMPERE;
4608
+ mmq_y = MMQ_Y_Q5_1_AMPERE;
4609
+ nwarps = NWARPS_Q5_1_AMPERE;
4610
+ } else if (compute_capability >= MIN_CC_DP4A) {
4611
+ mmq_x = MMQ_X_Q5_1_PASCAL;
4612
+ mmq_y = MMQ_Y_Q5_1_PASCAL;
4613
+ nwarps = NWARPS_Q5_1_PASCAL;
4126
4614
  } else {
4127
- const int mmq_x = 64;
4128
- const int mmq_y = 64;
4129
- const int nwarps = 8;
4130
-
4131
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4132
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4133
- const dim3 block_nums(block_num_x, block_num_y, 1);
4134
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4135
-
4136
- if (nrows_x % mmq_y == 0) {
4137
- const bool need_check = false;
4138
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4139
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4140
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4141
- } else {
4142
- const bool need_check = true;
4143
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4144
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4145
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4146
- }
4615
+ GGML_ASSERT(false);
4616
+ }
4617
+
4618
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4619
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4620
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4621
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4622
+
4623
+ if (nrows_x % mmq_y == 0) {
4624
+ const bool need_check = false;
4625
+ mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
4626
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4627
+ } else {
4628
+ const bool need_check = true;
4629
+ mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
4630
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4147
4631
  }
4148
4632
  }
4149
4633
 
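The 610 and 700 thresholds compared against g_compute_capabilities[] suggest the capability is stored as 100*major + 10*minor (6.1 becomes 610, 7.0 becomes 700). A short host-side sketch of obtaining that value, under that assumed encoding:

#include <cstdio>
#include <cuda_runtime.h>

// Query every visible device and encode its compute capability on the same
// scale the 610 (__dp4a) and 700 (Turing) thresholds above are written in.
int main() {
    int device_count = 0;
    if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
        fprintf(stderr, "no devices found\n");
        return 1;
    }
    for (int id = 0; id < device_count; ++id) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, id);
        const int cc = 100*prop.major + 10*prop.minor;
        printf("device %d: %s, compute capability %d (dp4a: %s, large mmq tiles: %s)\n",
               id, prop.name, cc, cc >= 610 ? "yes" : "no", cc >= 700 ? "yes" : "no");
    }
    return 0;
}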
@@ -4155,48 +4639,32 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
4155
4639
  CUDA_CHECK(cudaGetDevice(&id));
4156
4640
  const int compute_capability = g_compute_capabilities[id];
4157
4641
 
4642
+ int mmq_x, mmq_y, nwarps;
4158
4643
  if (compute_capability >= CC_TURING) {
4159
- const int mmq_x = 128;
4160
- const int mmq_y = 64;
4161
- const int nwarps = 4;
4162
-
4163
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4164
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4165
- const dim3 block_nums(block_num_x, block_num_y, 1);
4166
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4167
-
4168
- if (nrows_x % mmq_y == 0) {
4169
- const bool need_check = false;
4170
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4171
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4172
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4173
- } else {
4174
- const bool need_check = true;
4175
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4176
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4177
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4178
- }
4644
+ mmq_x = MMQ_X_Q8_0_AMPERE;
4645
+ mmq_y = MMQ_Y_Q8_0_AMPERE;
4646
+ nwarps = NWARPS_Q8_0_AMPERE;
4647
+ } else if (compute_capability >= MIN_CC_DP4A) {
4648
+ mmq_x = MMQ_X_Q8_0_PASCAL;
4649
+ mmq_y = MMQ_Y_Q8_0_PASCAL;
4650
+ nwarps = NWARPS_Q8_0_PASCAL;
4179
4651
  } else {
4180
- const int mmq_x = 64;
4181
- const int mmq_y = 64;
4182
- const int nwarps = 8;
4183
-
4184
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4185
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4186
- const dim3 block_nums(block_num_x, block_num_y, 1);
4187
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4188
-
4189
- if (nrows_x % mmq_y == 0) {
4190
- const bool need_check = false;
4191
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4192
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4193
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4194
- } else {
4195
- const bool need_check = true;
4196
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4197
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4198
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4199
- }
4652
+ GGML_ASSERT(false);
4653
+ }
4654
+
4655
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4656
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4657
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4658
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4659
+
4660
+ if (nrows_x % mmq_y == 0) {
4661
+ const bool need_check = false;
4662
+ mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
4663
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4664
+ } else {
4665
+ const bool need_check = true;
4666
+ mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
4667
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4200
4668
  }
4201
4669
  }
4202
4670
 
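All of these quantized paths rest on __dp4a, the byte-wise dot-product intrinsic gated by MIN_CC_DP4A (compute capability 6.1). A minimal kernel showing what the intrinsic computes, independent of the tiling logic above:

#include <cstdint>
#include <cuda_runtime.h>

// __dp4a(a, b, c) treats a and b as four packed signed 8-bit lanes and returns
// c + sum_i a_i * b_i in a single instruction on compute capability 6.1 and newer.
static __global__ void dp4a_demo(const int * a, const int * b, int * out) {
#if __CUDA_ARCH__ >= 610
    *out = __dp4a(*a, *b, 0);
#else
    // reference computation for older architectures
    int acc = 0;
    for (int i = 0; i < 4; ++i) {
        const int8_t ai = (*a >> (8*i)) & 0xff;
        const int8_t bi = (*b >> (8*i)) & 0xff;
        acc += ai*bi;
    }
    *out = acc;
#endif
}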
@@ -4208,48 +4676,32 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
4208
4676
  CUDA_CHECK(cudaGetDevice(&id));
4209
4677
  const int compute_capability = g_compute_capabilities[id];
4210
4678
 
4679
+ int mmq_x, mmq_y, nwarps;
4211
4680
  if (compute_capability >= CC_TURING) {
4212
- const int mmq_x = 64;
4213
- const int mmq_y = 128;
4214
- const int nwarps = 4;
4215
-
4216
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4217
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4218
- const dim3 block_nums(block_num_x, block_num_y, 1);
4219
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4220
-
4221
- if (nrows_x % mmq_y == 0) {
4222
- const bool need_check = false;
4223
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4224
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4225
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4226
- } else {
4227
- const bool need_check = true;
4228
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4229
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4230
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4231
- }
4681
+ mmq_x = MMQ_X_Q2_K_AMPERE;
4682
+ mmq_y = MMQ_Y_Q2_K_AMPERE;
4683
+ nwarps = NWARPS_Q2_K_AMPERE;
4684
+ } else if (compute_capability >= MIN_CC_DP4A) {
4685
+ mmq_x = MMQ_X_Q2_K_PASCAL;
4686
+ mmq_y = MMQ_Y_Q2_K_PASCAL;
4687
+ nwarps = NWARPS_Q2_K_PASCAL;
4232
4688
  } else {
4233
- const int mmq_x = 64;
4234
- const int mmq_y = 64;
4235
- const int nwarps = 8;
4236
-
4237
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4238
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4239
- const dim3 block_nums(block_num_x, block_num_y, 1);
4240
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4241
-
4242
- if (nrows_x % mmq_y == 0) {
4243
- const bool need_check = false;
4244
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4245
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4246
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4247
- } else {
4248
- const bool need_check = true;
4249
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4250
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4251
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4252
- }
4689
+ GGML_ASSERT(false);
4690
+ }
4691
+
4692
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4693
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4694
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4695
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4696
+
4697
+ if (nrows_x % mmq_y == 0) {
4698
+ const bool need_check = false;
4699
+ mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
4700
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4701
+ } else {
4702
+ const bool need_check = true;
4703
+ mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
4704
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4253
4705
  }
4254
4706
  }
4255
4707
 
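Every launcher here builds its block as (WARP_SIZE, nwarps, 1), so the tile constants implicitly fix the threads-per-block budget. A tiny host-side sanity check of that constraint, assuming a 32-lane warp (AMD wavefronts can be 64 wide):

#include <cassert>
#include <cuda_runtime.h>

// The mul_mat_q kernels are launched with blockDim = (WARP_SIZE, nwarps, 1); the product
// must stay within the device limit (1024 threads per block on current NVIDIA GPUs).
static void check_block_shape(const int warp_size, const int nwarps, const int device) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);
    assert(warp_size * nwarps <= prop.maxThreadsPerBlock);
    assert(prop.warpSize == warp_size);  // 32 on NVIDIA; 64 on some AMD hardware
}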
@@ -4261,48 +4713,32 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
4261
4713
  CUDA_CHECK(cudaGetDevice(&id));
4262
4714
  const int compute_capability = g_compute_capabilities[id];
4263
4715
 
4716
+ int mmq_x, mmq_y, nwarps;
4264
4717
  if (compute_capability >= CC_TURING) {
4265
- const int mmq_x = 128;
4266
- const int mmq_y = 128;
4267
- const int nwarps = 4;
4268
-
4269
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4270
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4271
- const dim3 block_nums(block_num_x, block_num_y, 1);
4272
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4273
-
4274
- if (nrows_x % mmq_y == 0) {
4275
- const bool need_check = false;
4276
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4277
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4278
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4279
- } else {
4280
- const bool need_check = true;
4281
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4282
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4283
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4284
- }
4718
+ mmq_x = MMQ_X_Q3_K_AMPERE;
4719
+ mmq_y = MMQ_Y_Q3_K_AMPERE;
4720
+ nwarps = NWARPS_Q3_K_AMPERE;
4721
+ } else if (compute_capability >= MIN_CC_DP4A) {
4722
+ mmq_x = MMQ_X_Q3_K_PASCAL;
4723
+ mmq_y = MMQ_Y_Q3_K_PASCAL;
4724
+ nwarps = NWARPS_Q3_K_PASCAL;
4285
4725
  } else {
4286
- const int mmq_x = 64;
4287
- const int mmq_y = 64;
4288
- const int nwarps = 8;
4289
-
4290
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4291
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4292
- const dim3 block_nums(block_num_x, block_num_y, 1);
4293
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4294
-
4295
- if (nrows_x % mmq_y == 0) {
4296
- const bool need_check = false;
4297
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4298
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4299
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4300
- } else {
4301
- const bool need_check = true;
4302
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4303
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4304
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4305
- }
4726
+ GGML_ASSERT(false);
4727
+ }
4728
+
4729
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4730
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4731
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4732
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4733
+
4734
+ if (nrows_x % mmq_y == 0) {
4735
+ const bool need_check = false;
4736
+ mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
4737
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4738
+ } else {
4739
+ const bool need_check = true;
4740
+ mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
4741
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4306
4742
  }
4307
4743
  }
4308
4744
 
@@ -4314,48 +4750,32 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
4314
4750
  CUDA_CHECK(cudaGetDevice(&id));
4315
4751
  const int compute_capability = g_compute_capabilities[id];
4316
4752
 
4753
+ int mmq_x, mmq_y, nwarps;
4317
4754
  if (compute_capability >= CC_TURING) {
4318
- const int mmq_x = 64;
4319
- const int mmq_y = 128;
4320
- const int nwarps = 4;
4321
-
4322
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4323
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4324
- const dim3 block_nums(block_num_x, block_num_y, 1);
4325
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4326
-
4327
- if (nrows_x % mmq_y == 0) {
4328
- const bool need_check = false;
4329
- mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
4330
- load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
4331
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4332
- } else {
4333
- const bool need_check = true;
4334
- mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
4335
- load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
4336
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4337
- }
4755
+ mmq_x = MMQ_X_Q4_K_AMPERE;
4756
+ mmq_y = MMQ_Y_Q4_K_AMPERE;
4757
+ nwarps = NWARPS_Q4_K_AMPERE;
4758
+ } else if (compute_capability >= MIN_CC_DP4A) {
4759
+ mmq_x = MMQ_X_Q4_K_PASCAL;
4760
+ mmq_y = MMQ_Y_Q4_K_PASCAL;
4761
+ nwarps = NWARPS_Q4_K_PASCAL;
4338
4762
  } else {
4339
- const int mmq_x = 32;
4340
- const int mmq_y = 64;
4341
- const int nwarps = 8;
4342
-
4343
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4344
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4345
- const dim3 block_nums(block_num_x, block_num_y, 1);
4346
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4347
-
4348
- if (nrows_x % mmq_y == 0) {
4349
- const bool need_check = false;
4350
- mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
4351
- load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
4352
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4353
- } else {
4354
- const bool need_check = true;
4355
- mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
4356
- load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
4357
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4358
- }
4763
+ GGML_ASSERT(false);
4764
+ }
4765
+
4766
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4767
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4768
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4769
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4770
+
4771
+ if (nrows_x % mmq_y == 0) {
4772
+ const bool need_check = false;
4773
+ mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
4774
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4775
+ } else {
4776
+ const bool need_check = true;
4777
+ mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
4778
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4359
4779
  }
4360
4780
  }
4361
4781
 
@@ -4367,48 +4787,32 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
4367
4787
  CUDA_CHECK(cudaGetDevice(&id));
4368
4788
  const int compute_capability = g_compute_capabilities[id];
4369
4789
 
4790
+ int mmq_x, mmq_y, nwarps;
4370
4791
  if (compute_capability >= CC_TURING) {
4371
- const int mmq_x = 64;
4372
- const int mmq_y = 128;
4373
- const int nwarps = 4;
4374
-
4375
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4376
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4377
- const dim3 block_nums(block_num_x, block_num_y, 1);
4378
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4379
-
4380
- if (nrows_x % mmq_y == 0) {
4381
- const bool need_check = false;
4382
- mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4383
- load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4384
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4385
- } else {
4386
- const bool need_check = true;
4387
- mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4388
- load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4389
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4390
- }
4792
+ mmq_x = MMQ_X_Q5_K_AMPERE;
4793
+ mmq_y = MMQ_Y_Q5_K_AMPERE;
4794
+ nwarps = NWARPS_Q5_K_AMPERE;
4795
+ } else if (compute_capability >= MIN_CC_DP4A) {
4796
+ mmq_x = MMQ_X_Q5_K_PASCAL;
4797
+ mmq_y = MMQ_Y_Q5_K_PASCAL;
4798
+ nwarps = NWARPS_Q5_K_PASCAL;
4391
4799
  } else {
4392
- const int mmq_x = 64;
4393
- const int mmq_y = 64;
4394
- const int nwarps = 8;
4395
-
4396
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4397
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4398
- const dim3 block_nums(block_num_x, block_num_y, 1);
4399
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4400
-
4401
- if (nrows_x % mmq_y == 0) {
4402
- const bool need_check = false;
4403
- mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4404
- load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4405
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4406
- } else {
4407
- const bool need_check = true;
4408
- mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4409
- load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4410
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4411
- }
4800
+ GGML_ASSERT(false);
4801
+ }
4802
+
4803
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4804
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4805
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4806
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4807
+
4808
+ if (nrows_x % mmq_y == 0) {
4809
+ const bool need_check = false;
4810
+ mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
4811
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4812
+ } else {
4813
+ const bool need_check = true;
4814
+ mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
4815
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4412
4816
  }
4413
4817
  }
4414
4818
 
@@ -4420,48 +4824,32 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
4420
4824
  CUDA_CHECK(cudaGetDevice(&id));
4421
4825
  const int compute_capability = g_compute_capabilities[id];
4422
4826
 
4827
+ int mmq_x, mmq_y, nwarps;
4423
4828
  if (compute_capability >= CC_TURING) {
4424
- const int mmq_x = 64;
4425
- const int mmq_y = 64;
4426
- const int nwarps = 4;
4427
-
4428
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4429
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4430
- const dim3 block_nums(block_num_x, block_num_y, 1);
4431
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4432
-
4433
- if (nrows_x % mmq_y == 0) {
4434
- const bool need_check = false;
4435
- mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4436
- load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4437
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4438
- } else {
4439
- const bool need_check = true;
4440
- mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4441
- load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4442
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4443
- }
4829
+ mmq_x = MMQ_X_Q6_K_AMPERE;
4830
+ mmq_y = MMQ_Y_Q6_K_AMPERE;
4831
+ nwarps = NWARPS_Q6_K_AMPERE;
4832
+ } else if (compute_capability >= MIN_CC_DP4A) {
4833
+ mmq_x = MMQ_X_Q6_K_PASCAL;
4834
+ mmq_y = MMQ_Y_Q6_K_PASCAL;
4835
+ nwarps = NWARPS_Q6_K_PASCAL;
4444
4836
  } else {
4445
- const int mmq_x = 32;
4446
- const int mmq_y = 64;
4447
- const int nwarps = 8;
4448
-
4449
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4450
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4451
- const dim3 block_nums(block_num_x, block_num_y, 1);
4452
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4453
-
4454
- if (nrows_x % mmq_y == 0) {
4455
- const bool need_check = false;
4456
- mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4457
- load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4458
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4459
- } else {
4460
- const bool need_check = true;
4461
- mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4462
- load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4463
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4464
- }
4837
+ GGML_ASSERT(false);
4838
+ }
4839
+
4840
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4841
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4842
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4843
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4844
+
4845
+ if (nrows_x % mmq_y == 0) {
4846
+ const bool need_check = false;
4847
+ mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
4848
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4849
+ } else {
4850
+ const bool need_check = true;
4851
+ mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
4852
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4465
4853
  }
4466
4854
  }
4467
4855
 
@@ -4511,13 +4899,21 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
4511
4899
 
4512
4900
  static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
4513
4901
  const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
4514
- GGML_ASSERT(nrows % 2 == 0);
4515
- const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
4902
+ GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
4903
+ const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
4516
4904
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
4517
- const dim3 block_nums(num_blocks_x, nrows, 1);
4905
+ const dim3 block_nums(nrows, num_blocks_x, 1);
4518
4906
  rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
4519
4907
  }
4520
4908
 
4909
+ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
4910
+ const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
4911
+ const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
4912
+ const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
4913
+ const dim3 block_nums(nrows, num_blocks_x, 1);
4914
+ rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
4915
+ }
4916
+
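rope_neox_f32_cuda takes the same arguments as the existing rope_f32_cuda; the variants differ only in which element pairs are rotated (adjacent pairs versus elements half a head apart). As a rough CPU reference for the rotation itself, using the usual RoPE recurrence (the angle for pair i is p * theta_scale^i, where p is the row's scaled position) rather than the kernel code verbatim:

#include <cmath>

// Rotate one row of ncols floats in place, pairing x[2*i] with x[2*i + 1]
// (the interleaved layout; the neox variant pairs x[i] with x[i + ncols/2]).
static void rope_row_reference(float * x, const int ncols, const float p, const float theta_scale) {
    float theta = p;
    for (int i = 0; i < ncols; i += 2) {
        const float c  = cosf(theta);
        const float s  = sinf(theta);
        const float x0 = x[i];
        const float x1 = x[i + 1];
        x[i]     = x0*c - x1*s;
        x[i + 1] = x0*s + x1*c;
        theta *= theta_scale;   // next pair rotates by a geometrically smaller angle
    }
}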
4521
4917
  static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
4522
4918
  GGML_ASSERT(nrows % 4 == 0);
4523
4919
  const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@@ -4526,16 +4922,25 @@ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, con
4526
4922
  rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
4527
4923
  }
4528
4924
 
4925
+ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
4926
+ const int k_rows, const int n_heads_log2_floor, const float m0,
4927
+ const float m1, cudaStream_t stream) {
4928
+ const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
4929
+ const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
4930
+ const dim3 block_nums(num_blocks_x, nrows, 1);
4931
+ alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
4932
+ }
4933
+
4529
4934
  static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
4530
- const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
4935
+ const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
4531
4936
  const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
4532
- const dim3 block_nums(block_num_x, nrows_x, 1);
4937
+ const dim3 block_nums(nrows_x, block_num_x, 1);
4533
4938
  diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
4534
4939
  }
4535
4940
 
4536
4941
  static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
4537
- const dim3 block_dims(WARP_SIZE, 1, 1);
4538
- const dim3 block_nums(1, nrows_x, 1);
4942
+ const dim3 block_dims(1, WARP_SIZE, 1);
4943
+ const dim3 block_nums(nrows_x, 1, 1);
4539
4944
  soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
4540
4945
  }
4541
4946
 
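The new soft_max launch shape, one block of WARP_SIZE threads per row, is the classic one-warp-per-row reduction layout. A stripped-down sketch of the kind of warp-level reduction such a kernel can build on (not the library's kernel; a real softmax also subtracts the row maximum first for numerical stability):

#include <cuda_runtime.h>

// Sum a value across the 32 lanes of a warp with shuffle instructions;
// after the loop every lane holds the full sum.
static __device__ __forceinline__ float warp_reduce_sum(float v) {
    #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        v += __shfl_xor_sync(0xffffffff, v, mask, 32);
    }
    return v;
}

// One warp handles one row: each lane accumulates a strided slice of exp(x),
// then the warp reduces and lane 0 writes the row sum.
static __global__ void row_exp_sum(const float * x, float * row_sum, const int ncols) {
    const int row  = blockIdx.x;
    const int lane = threadIdx.y;          // launched with blockDim = (1, 32, 1)
    float acc = 0.0f;
    for (int col = lane; col < ncols; col += 32) {
        acc += expf(x[row*ncols + col]);
    }
    acc = warp_reduce_sum(acc);
    if (lane == 0) {
        row_sum[row] = acc;
    }
}

static void row_exp_sum_cuda(const float * x, float * row_sum, const int ncols, const int nrows, cudaStream_t stream) {
    const dim3 block_dims(1, 32, 1);       // same shape as the soft_max launch above
    const dim3 block_nums(nrows, 1, 1);
    row_exp_sum<<<block_nums, block_dims, 0, stream>>>(x, row_sum, ncols);
}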
@@ -4640,10 +5045,18 @@ void ggml_init_cublas() {
4640
5045
  static bool initialized = false;
4641
5046
 
4642
5047
  if (!initialized) {
5048
+
5049
+ #ifdef __HIP_PLATFORM_AMD__
5050
+ // Workaround for a rocBLAS bug when using multiple graphics cards:
5051
+ // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
5052
+ rocblas_initialize();
5053
+ CUDA_CHECK(cudaDeviceSynchronize());
5054
+ #endif
5055
+
4643
5056
  CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
4644
5057
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
4645
5058
  int64_t total_vram = 0;
4646
- fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
5059
+ fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
4647
5060
  for (int id = 0; id < g_device_count; ++id) {
4648
5061
  cudaDeviceProp prop;
4649
5062
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
@@ -5241,7 +5654,8 @@ inline void ggml_cuda_op_rope(
5241
5654
 
5242
5655
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
5243
5656
 
5244
- const bool is_glm = mode & 4;
5657
+ const bool is_neox = mode & 2;
5658
+ const bool is_glm = mode & 4;
5245
5659
 
5246
5660
  // compute
5247
5661
  if (is_glm) {
@@ -5249,6 +5663,10 @@ inline void ggml_cuda_op_rope(
5249
5663
  const float id_p = min(p, n_ctx - 2.f);
5250
5664
  const float block_p = max(p - (n_ctx - 2.f), 0.f);
5251
5665
  rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
5666
+ } else if (is_neox) {
5667
+ GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
5668
+ const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5669
+ rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
5252
5670
  } else {
5253
5671
  const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5254
5672
  rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
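The rope op packs its variants into bit flags of mode: bit 1 selects the NeoX pairing, bit 2 the GLM path, and a cleared bit 0 means positions start at n_past. A compact sketch of just that decoding (the GLM branch computes its own positions, as shown above):

enum rope_variant { ROPE_NORMAL, ROPE_NEOX, ROPE_GLM };

struct rope_dispatch {
    rope_variant variant;
    float        p0;      // starting position for the normal/NeoX kernels
};

// Mirror of the flag decoding above: bit 2 -> GLM, bit 1 -> NeoX pairing,
// bit 0 cleared -> positions start at n_past, bit 0 set -> positions start at 0.
static rope_dispatch decode_rope_mode(const int mode, const int n_past, const float freq_scale) {
    rope_dispatch d;
    d.variant = (mode & 4) ? ROPE_GLM : (mode & 2) ? ROPE_NEOX : ROPE_NORMAL;
    d.p0      = ((mode & 1) == 0 ? n_past : 0) * freq_scale;
    return d;
}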
@@ -5261,6 +5679,41 @@ inline void ggml_cuda_op_rope(
5261
5679
  (void) i1;
5262
5680
  }
5263
5681
 
5682
+ inline void ggml_cuda_op_alibi(
5683
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5684
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5685
+ cudaStream_t & cudaStream_main){
5686
+
5687
+ GGML_ASSERT(src0_ddf_i != nullptr);
5688
+ GGML_ASSERT(dst_ddf_i != nullptr);
5689
+
5690
+ const int64_t ne00 = src0->ne[0];
5691
+ const int64_t ne01 = src0->ne[1];
5692
+ const int64_t ne02 = src0->ne[2];
5693
+ const int64_t i01_diff = i01_high - i01_low;
5694
+
5695
+ const int n_past = ((int32_t *) dst->op_params)[0];
5696
+ const int n_head = ((int32_t *) dst->op_params)[1];
5697
+ float max_bias;
5698
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
5699
+
5700
+ GGML_ASSERT(ne01 + n_past == ne00);
5701
+ GGML_ASSERT(n_head == ne02);
5702
+
5703
+ const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
5704
+
5705
+ const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
5706
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
5707
+
5708
+ // compute
5709
+ alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
5710
+
5711
+ (void) src1;
5712
+ (void) src0_ddq_i;
5713
+ (void) src1_ddf_i;
5714
+ (void) i1;
5715
+ }
5716
+
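The m0 and m1 bases computed above are the standard ALiBi head-slope parameters: the first power-of-two block of heads gets successive powers of m0, the remaining heads get odd powers of m1. A host-side sketch of the resulting per-head slope table, following that usual ALiBi construction (the exponent pattern itself is not spelled out in this file):

#include <cmath>
#include <vector>

// Build one bias slope per attention head from max_bias, using the same m0/m1 bases as above.
static std::vector<float> alibi_slopes(const int n_head, const float max_bias) {
    const int n_heads_log2_floor = 1 << (int) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -max_bias          / n_heads_log2_floor);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

    std::vector<float> slopes(n_head);
    for (int h = 0; h < n_head; ++h) {
        if (h < n_heads_log2_floor) {
            slopes[h] = powf(m0, (float) (h + 1));
        } else {
            slopes[h] = powf(m1, (float) (2*(h - n_heads_log2_floor) + 1));
        }
    }
    return slopes;
}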
5264
5717
  inline void ggml_cuda_op_diag_mask_inf(
5265
5718
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5266
5719
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -5881,6 +6334,11 @@ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ten
5881
6334
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
5882
6335
  }
5883
6336
 
6337
+ void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6338
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6339
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
6340
+ }
6341
+
5884
6342
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
5885
6343
  (void) src0;
5886
6344
  (void) src1;
@@ -6000,7 +6458,7 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
6000
6458
  return extra;
6001
6459
  }
6002
6460
 
6003
- void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
6461
+ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
6004
6462
  if (scratch && g_scratch_size == 0) {
6005
6463
  return;
6006
6464
  }
@@ -6009,14 +6467,19 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
6009
6467
  if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
6010
6468
  const ggml_op src0_op = tensor->src[0]->op;
6011
6469
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
6012
- ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
6470
+ ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
6013
6471
  }
6014
6472
  }
6015
6473
  if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
6016
- ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
6474
+ ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
6017
6475
  }
6018
6476
 
6019
6477
  tensor->backend = GGML_BACKEND_GPU;
6478
+
6479
+ if (scratch && no_alloc) {
6480
+ return;
6481
+ }
6482
+
6020
6483
  struct ggml_tensor_extra_gpu * extra;
6021
6484
 
6022
6485
  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
@@ -6068,16 +6531,48 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
6068
6531
  tensor->extra = extra;
6069
6532
  }
6070
6533
 
6534
+ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
6535
+ if (g_scratch_size == 0) {
6536
+ return;
6537
+ }
6538
+ if (g_scratch_buffer == nullptr) {
6539
+ CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
6540
+ }
6541
+
6542
+ struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
6543
+
6544
+ const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
6545
+ tensor->op == GGML_OP_VIEW;
6546
+
6547
+ if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
6548
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
6549
+ char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
6550
+ size_t view_offset = 0;
6551
+ if (tensor->op == GGML_OP_VIEW) {
6552
+ memcpy(&view_offset, tensor->op_params, sizeof(size_t));
6553
+ }
6554
+ extra->data_device[g_main_device] = src0_ddc + view_offset;
6555
+ } else {
6556
+ extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
6557
+ }
6558
+
6559
+ tensor->extra = extra;
6560
+ }
6561
+
6071
6562
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
6072
- ggml_cuda_assign_buffers_impl(tensor, true, false);
6563
+ ggml_cuda_assign_buffers_impl(tensor, true, false, false);
6564
+ }
6565
+
6566
+ void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
6567
+ ggml_cuda_assign_buffers_impl(tensor, true, false, true);
6073
6568
  }
6074
6569
 
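Together with ggml_cuda_assign_scratch_offset above, the no_alloc variant splits placement into two passes: first mark a tensor as GPU-resident without touching the scratch buffer, then bind it to an offset once the caller has planned the layout. A schematic usage sketch, assuming both entry points are exported through ggml-cuda.h; plan_scratch_offset is a hypothetical stand-in for whatever allocator the caller uses:

#include "ggml.h"
#include "ggml-cuda.h"

// Hypothetical offset planner supplied by the caller (e.g. a graph allocator).
extern size_t plan_scratch_offset(const struct ggml_tensor * t);

static void place_on_gpu_scratch(struct ggml_tensor * t) {
    ggml_cuda_assign_buffers_no_alloc(t);                        // pass 1: tag as GPU, no allocation
    ggml_cuda_assign_scratch_offset(t, plan_scratch_offset(t));  // pass 2: bind to its scratch slice
}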
6075
6570
  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
6076
- ggml_cuda_assign_buffers_impl(tensor, false, false);
6571
+ ggml_cuda_assign_buffers_impl(tensor, false, false, false);
6077
6572
  }
6078
6573
 
6079
6574
  void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
6080
- ggml_cuda_assign_buffers_impl(tensor, false, true);
6575
+ ggml_cuda_assign_buffers_impl(tensor, false, true, false);
6081
6576
  }
6082
6577
 
6083
6578
  void ggml_cuda_set_main_device(int main_device) {
@@ -6216,6 +6711,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
6216
6711
  }
6217
6712
  func = ggml_cuda_rope;
6218
6713
  break;
6714
+ case GGML_OP_ALIBI:
6715
+ if (!any_on_device) {
6716
+ return false;
6717
+ }
6718
+ func = ggml_cuda_alibi;
6719
+ break;
6219
6720
  default:
6220
6721
  return false;
6221
6722
  }
@@ -6229,3 +6730,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
6229
6730
  func(tensor->src[0], tensor->src[1], tensor);
6230
6731
  return true;
6231
6732
  }
6733
+
6734
+ int ggml_cuda_get_device_count() {
6735
+ int device_count;
6736
+ CUDA_CHECK(cudaGetDeviceCount(&device_count));
6737
+ return device_count;
6738
+ }
6739
+
6740
+ void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
6741
+ cudaDeviceProp prop;
6742
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
6743
+ snprintf(description, description_size, "%s", prop.name);
6744
+ }
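The two helpers added at the end give callers a minimal device-discovery API for the CUDA/HIP backend. A short usage sketch, assuming the matching declarations are exported through ggml-cuda.h:

#include <cstdio>
#include "ggml-cuda.h"

// List every device the backend can see, using the helpers added above.
int main() {
    const int n = ggml_cuda_get_device_count();
    for (int i = 0; i < n; ++i) {
        char desc[256];
        ggml_cuda_get_device_description(i, desc, sizeof(desc));
        printf("device %d: %s\n", i, desc);
    }
    return 0;
}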