llama_cpp 0.3.7 → 0.3.8

@@ -1399,6 +1399,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
  // second part effectively subtracts 8 from each quant value
  return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
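The same one-line change repeats across the vec_dot hunks below: the #else branch compiled when __CUDA_ARCH__ < MIN_CC_DP4A (GPUs without the __dp4a integer intrinsic) used to return 0.0f silently and now asserts first. A minimal sketch of the pattern, assuming only that MIN_CC_DP4A is defined as in ggml-cuda.cu:

    #include <cassert>

    // Sketch only: below MIN_CC_DP4A this function is dead code, and
    // assert(false) makes an accidental call fail loudly in debug builds
    // instead of feeding a silent 0.0f into the matrix product.
    static __device__ __forceinline__ float vec_dot_sketch(int v, int u, float d) {
    #if __CUDA_ARCH__ >= MIN_CC_DP4A
        return d * __dp4a(v, u, 0); // 4-way int8 dot product, then scale
    #else
        assert(false);              // unreachable on supported hardware
        return 0.0f;                // only to satisfy the compiler
    #endif
    }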
@@ -1436,6 +1437,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
  // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
  return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1471,6 +1473,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
  // second part effectively subtracts 16 from each quant value
  return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1516,6 +1519,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
  return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1537,6 +1541,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp

  return d8_0*d8_1 * sumi;
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1567,6 +1572,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
  // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
  return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1602,6 +1608,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(

  return dm2f.x*sumf_d - dm2f.y*sumf_m;
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1639,6 +1646,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(

  return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1679,6 +1687,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(

  return d3 * sumf;
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1704,6 +1713,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(

  return d3*d8 * sumi;
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1737,12 +1747,12 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
  return dm4f.x*sumf_d - dm4f.y*sumf_m;

  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }

  // contiguous u/y values
- // also used for q5_K
  static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
  const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
  const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
@@ -1752,19 +1762,18 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
  float sumf_m = 0.0f;

  #pragma unroll
- for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) {
+ for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
  int sumi_d = 0;

  #pragma unroll
- for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) {
- sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product
- sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
+ for (int j = 0; j < QI8_1; ++j) {
+ sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
  }

- const float2 ds8f = __half22float2(ds8[i0 / 4]);
+ const float2 ds8f = __half22float2(ds8[i]);

- sumf_d += ds8f.x * (sc[i0/4] * sumi_d);
- sumf_m += ds8f.y * m[i0/4]; // sum of q8_1 block * q4_K min val
+ sumf_d += ds8f.x * (sc[i] * sumi_d);
+ sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
  }

  const float2 dm4f = __half22float2(dm4);
@@ -1772,6 +1781,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
  return dm4f.x*sumf_d - dm4f.y*sumf_m;

  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
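The q4_K MMQ rewrite above swaps the stride-i0 loop over precomputed nibbles for a loop over q8_1 scale groups. Assuming the usual ggml constants (QR4_K = 2, VDR_Q4_K_Q8_1_MMQ = 8, QI8_1 = 8), the outer loop runs QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1 = 2 times: i = 0 dots the low nibbles, i = 1 the high nibbles, and sc[i], m[i], ds8[i] are indexed directly instead of via i0/4. A standalone sketch (hypothetical helper, not part of the diff) of the on-the-fly unpacking that replaces the old v[] scratch array:

    // One outer iteration of the new loop, QI8_1 hard-coded to 8 for clarity.
    static __device__ int dot_nibbles(const int * v, const int * u, int i) {
        int sumi = 0;
    #pragma unroll
        for (int j = 0; j < 8; ++j) {                    // j < QI8_1
            const int vi = (v[j] >> (4*i)) & 0x0F0F0F0F; // i==0: low, i==1: high nibbles
            sumi = __dp4a(vi, u[i*8 + j], sumi);         // SIMD dot product
        }
        return sumi;
    }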
@@ -1780,7 +1790,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
  #define VDR_Q5_K_Q8_1_MMQ 8

  // contiguous v/x values
- static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
  const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
  const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {

@@ -1812,6 +1822,41 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
  return dm5f.x*sumf_d - dm5f.y*sumf_m;

  #else
+ assert(false);
+ return 0.0f; // only to satisfy the compiler
+ #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+ }
+
+ // contiguous u/y values
+ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
+ const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+ const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+ #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+ float sumf_d = 0.0f;
+ float sumf_m = 0.0f;
+
+ #pragma unroll
+ for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+ int sumi_d = 0;
+
+ #pragma unroll
+ for (int j = 0; j < QI8_1; ++j) {
+ sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+ }
+
+ const float2 ds8f = __half22float2(ds8[i]);
+
+ sumf_d += ds8f.x * (sc[i] * sumi_d);
+ sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
+ }
+
+ const float2 dm4f = __half22float2(dm4);
+
+ return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+ #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
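q5_K previously reused vec_dot_q4_K_q8_1_impl_mmq (note the removed "also used for q5_K" comment earlier); it now gets its own _mmq implementation, while the old _impl is renamed _impl_vmmq to mark it as the mul_mat_vec_q path. The two inner products differ only in how v is consumed, since the q5_K x tile already holds the reassembled 5-bit values whereas q4_K keeps two nibbles packed per byte:

    // Both lines taken verbatim from this diff:
    // q4_K mmq - the nibble for scale group i is masked out of the tile:
    //   sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d);
    // q5_K mmq - values are used as-is, one contiguous run per scale group:
    //   sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d);
    // The dm4 parameter name and the "q4_K min val" comment carry over
    // from the q4_K routine this was apparently modeled on.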
@@ -1842,6 +1887,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(

  return d*sumf;
  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -1873,6 +1919,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
  return d6 * sumf_d;

  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
  }
@@ -2722,6 +2769,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  return dall * sumf_d - dmin * sumf_m;

  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

@@ -2808,18 +2856,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
  const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
  const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

- int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
-
- #pragma unroll
- for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
- v[l + 0] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
- v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
- }
-
  const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

  const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
- return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
+ return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
+ x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
  }

  static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
@@ -2866,7 +2907,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  u[2*i+1] = q8[4];
  }

- return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
+ return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);

  #else

@@ -2905,6 +2946,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  return d * sumf_d;

  #else
+ assert(false);
  return 0.0f; // only to satisfy the compiler
  #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

@@ -3008,7 +3050,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(

  const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
  const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
- return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
+ return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
+ x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
  }

  static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -3135,7 +3178,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(

  template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
  allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
- static __global__ void mul_mat_q(
+ static __device__ __forceinline__ void mul_mat_q(
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

@@ -3150,7 +3193,6 @@ static __global__ void mul_mat_q(

  const int row_dst_0 = blockIdx.x*mmq_y;
  const int & row_x_0 = row_dst_0;
- const int row_dst = row_dst_0 + threadIdx.x;

  const int col_dst_0 = blockIdx.y*mmq_x;
  const int & col_y_0 = col_dst_0;
@@ -3223,11 +3265,7 @@ static __global__ void mul_mat_q(
  }
  }

-
- if (row_dst >= nrows_dst) {
- return;
- }
-
+ #pragma unroll
  for (int j = 0; j < mmq_x; j += nwarps) {
  const int col_dst = col_dst_0 + j + threadIdx.y;

@@ -3235,12 +3273,375 @@ static __global__ void mul_mat_q(
  return;
  }

+ #pragma unroll
  for (int i = 0; i < mmq_y; i += WARP_SIZE) {
- dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/nwarps];
+ const int row_dst = row_dst_0 + threadIdx.x + i;
+
+ if (row_dst >= nrows_dst) {
+ continue;
+ }
+
+ dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
  }
  }
  }

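The write-back above also changes shape. row_dst used to be computed once from threadIdx.x and checked with a single early return, yet each thread stores mmq_y/WARP_SIZE rows (up to 4 when mmq_y = 128), so only the first stored row was actually bounds-checked. The row index is now recomputed per iteration and out-of-range rows are skipped with continue, which also allows #pragma unroll on both loops. The resulting inner loop, as it reads after this hunk:

    #pragma unroll
    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
        const int row_dst = row_dst_0 + threadIdx.x + i; // per row, not per thread
        if (row_dst >= nrows_dst) {
            continue;                                    // skip just this row
        }
        dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
    }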
+ #define MMQ_X_Q4_0_AMPERE 64
+ #define MMQ_Y_Q4_0_AMPERE 128
+ #define NWARPS_Q4_0_AMPERE 4
+ #define MMQ_X_Q4_0_PASCAL 64
+ #define MMQ_Y_Q4_0_PASCAL 64
+ #define NWARPS_Q4_0_PASCAL 8
+
+ template <bool need_check> static __global__ void mul_mat_q4_0(
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q4_0_AMPERE;
+ const int mmq_y = MMQ_Y_Q4_0_AMPERE;
+ const int nwarps = NWARPS_Q4_0_AMPERE;
+
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q4_0_PASCAL;
+ const int mmq_y = MMQ_Y_Q4_0_PASCAL;
+ const int nwarps = NWARPS_Q4_0_PASCAL;
+
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+ (void) vec_dot_q4_0_q8_1_mul_mat;
+ assert(false);
+ #endif // __CUDA_ARCH__ >= CC_TURING
+ }
+
+ #define MMQ_X_Q4_1_AMPERE 64
+ #define MMQ_Y_Q4_1_AMPERE 128
+ #define NWARPS_Q4_1_AMPERE 4
+ #define MMQ_X_Q4_1_PASCAL 64
+ #define MMQ_Y_Q4_1_PASCAL 64
+ #define NWARPS_Q4_1_PASCAL 8
+
+ template <bool need_check> static __global__ void
+ #if __CUDA_ARCH__ < CC_TURING
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
+ #endif // __CUDA_ARCH__ < CC_TURING
+ mul_mat_q4_1(
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q4_1_AMPERE;
+ const int mmq_y = MMQ_Y_Q4_1_AMPERE;
+ const int nwarps = NWARPS_Q4_1_AMPERE;
+
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q4_1_PASCAL;
+ const int mmq_y = MMQ_Y_Q4_1_PASCAL;
+ const int nwarps = NWARPS_Q4_1_PASCAL;
+
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+ (void) vec_dot_q4_1_q8_1_mul_mat;
+ assert(false);
+ #endif // __CUDA_ARCH__ >= CC_TURING
+ }
+
+ #define MMQ_X_Q5_0_AMPERE 128
+ #define MMQ_Y_Q5_0_AMPERE 64
+ #define NWARPS_Q5_0_AMPERE 4
+ #define MMQ_X_Q5_0_PASCAL 64
+ #define MMQ_Y_Q5_0_PASCAL 64
+ #define NWARPS_Q5_0_PASCAL 8
+
+ template <bool need_check> static __global__ void mul_mat_q5_0(
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q5_0_AMPERE;
+ const int mmq_y = MMQ_Y_Q5_0_AMPERE;
+ const int nwarps = NWARPS_Q5_0_AMPERE;
+
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q5_0_PASCAL;
+ const int mmq_y = MMQ_Y_Q5_0_PASCAL;
+ const int nwarps = NWARPS_Q5_0_PASCAL;
+
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+ (void) vec_dot_q5_0_q8_1_mul_mat;
+ assert(false);
+ #endif // __CUDA_ARCH__ >= CC_TURING
+ }
+
+ #define MMQ_X_Q5_1_AMPERE 128
+ #define MMQ_Y_Q5_1_AMPERE 64
+ #define NWARPS_Q5_1_AMPERE 4
+ #define MMQ_X_Q5_1_PASCAL 64
+ #define MMQ_Y_Q5_1_PASCAL 64
+ #define NWARPS_Q5_1_PASCAL 8
+
+ template <bool need_check> static __global__ void mul_mat_q5_1(
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q5_1_AMPERE;
+ const int mmq_y = MMQ_Y_Q5_1_AMPERE;
+ const int nwarps = NWARPS_Q5_1_AMPERE;
+
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q5_1_PASCAL;
+ const int mmq_y = MMQ_Y_Q5_1_PASCAL;
+ const int nwarps = NWARPS_Q5_1_PASCAL;
+
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+ (void) vec_dot_q5_1_q8_1_mul_mat;
+ assert(false);
+ #endif // __CUDA_ARCH__ >= CC_TURING
+ }
+
+ #define MMQ_X_Q8_0_AMPERE 128
+ #define MMQ_Y_Q8_0_AMPERE 64
+ #define NWARPS_Q8_0_AMPERE 4
+ #define MMQ_X_Q8_0_PASCAL 64
+ #define MMQ_Y_Q8_0_PASCAL 64
+ #define NWARPS_Q8_0_PASCAL 8
+
+ template <bool need_check> static __global__ void mul_mat_q8_0(
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q8_0_AMPERE;
+ const int mmq_y = MMQ_Y_Q8_0_AMPERE;
+ const int nwarps = NWARPS_Q8_0_AMPERE;
+
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q8_0_PASCAL;
+ const int mmq_y = MMQ_Y_Q8_0_PASCAL;
+ const int nwarps = NWARPS_Q8_0_PASCAL;
+
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+ (void) vec_dot_q8_0_q8_1_mul_mat;
+ assert(false);
+ #endif // __CUDA_ARCH__ >= CC_TURING
+ }
+
+ #define MMQ_X_Q2_K_AMPERE 64
+ #define MMQ_Y_Q2_K_AMPERE 128
+ #define NWARPS_Q2_K_AMPERE 4
+ #define MMQ_X_Q2_K_PASCAL 64
+ #define MMQ_Y_Q2_K_PASCAL 64
+ #define NWARPS_Q2_K_PASCAL 8
+
+ template <bool need_check> static __global__ void mul_mat_q2_K(
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q2_K_AMPERE;
+ const int mmq_y = MMQ_Y_Q2_K_AMPERE;
+ const int nwarps = NWARPS_Q2_K_AMPERE;
+
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q2_K_PASCAL;
+ const int mmq_y = MMQ_Y_Q2_K_PASCAL;
+ const int nwarps = NWARPS_Q2_K_PASCAL;
+
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+ (void) vec_dot_q2_K_q8_1_mul_mat;
+ assert(false);
+ #endif // __CUDA_ARCH__ >= CC_TURING
+ }
+
+ #define MMQ_X_Q3_K_AMPERE 128
+ #define MMQ_Y_Q3_K_AMPERE 128
+ #define NWARPS_Q3_K_AMPERE 4
+ #define MMQ_X_Q3_K_PASCAL 64
+ #define MMQ_Y_Q3_K_PASCAL 64
+ #define NWARPS_Q3_K_PASCAL 8
+
+ template <bool need_check> static __global__ void
+ #if __CUDA_ARCH__ < CC_TURING
+ __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
+ #endif // __CUDA_ARCH__ < CC_TURING
+ mul_mat_q3_K(
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q3_K_AMPERE;
+ const int mmq_y = MMQ_Y_Q3_K_AMPERE;
+ const int nwarps = NWARPS_Q3_K_AMPERE;
+
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q3_K_PASCAL;
+ const int mmq_y = MMQ_Y_Q3_K_PASCAL;
+ const int nwarps = NWARPS_Q3_K_PASCAL;
+
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+ (void) vec_dot_q3_K_q8_1_mul_mat;
+ assert(false);
+ #endif // __CUDA_ARCH__ >= CC_TURING
+ }
+
+ #define MMQ_X_Q4_K_AMPERE 64
+ #define MMQ_Y_Q4_K_AMPERE 128
+ #define NWARPS_Q4_K_AMPERE 4
+ #define MMQ_X_Q4_K_PASCAL 64
+ #define MMQ_Y_Q4_K_PASCAL 64
+ #define NWARPS_Q4_K_PASCAL 8
+
+ template <bool need_check> static __global__ void
+ #if __CUDA_ARCH__ < CC_TURING
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
+ #endif // __CUDA_ARCH__ < CC_TURING
+ mul_mat_q4_K(
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q4_K_AMPERE;
+ const int mmq_y = MMQ_Y_Q4_K_AMPERE;
+ const int nwarps = NWARPS_Q4_K_AMPERE;
+
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q4_K_PASCAL;
+ const int mmq_y = MMQ_Y_Q4_K_PASCAL;
+ const int nwarps = NWARPS_Q4_K_PASCAL;
+
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+ (void) vec_dot_q4_K_q8_1_mul_mat;
+ assert(false);
+ #endif // __CUDA_ARCH__ >= CC_TURING
+ }
+
+ #define MMQ_X_Q5_K_AMPERE 64
+ #define MMQ_Y_Q5_K_AMPERE 128
+ #define NWARPS_Q5_K_AMPERE 4
+ #define MMQ_X_Q5_K_PASCAL 64
+ #define MMQ_Y_Q5_K_PASCAL 64
+ #define NWARPS_Q5_K_PASCAL 8
+
+ template <bool need_check> static __global__ void mul_mat_q5_K(
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q5_K_AMPERE;
+ const int mmq_y = MMQ_Y_Q5_K_AMPERE;
+ const int nwarps = NWARPS_Q5_K_AMPERE;
+
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q5_K_PASCAL;
+ const int mmq_y = MMQ_Y_Q5_K_PASCAL;
+ const int nwarps = NWARPS_Q5_K_PASCAL;
+
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+ (void) vec_dot_q5_K_q8_1_mul_mat;
+ assert(false);
+ #endif // __CUDA_ARCH__ >= CC_TURING
+ }
+
+ #define MMQ_X_Q6_K_AMPERE 64
+ #define MMQ_Y_Q6_K_AMPERE 64
+ #define NWARPS_Q6_K_AMPERE 4
+ #define MMQ_X_Q6_K_PASCAL 64
+ #define MMQ_Y_Q6_K_PASCAL 64
+ #define NWARPS_Q6_K_PASCAL 8
+
+ template <bool need_check> static __global__ void
+ #if __CUDA_ARCH__ < CC_TURING
+ __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
+ #endif // __CUDA_ARCH__ < CC_TURING
+ mul_mat_q6_K(
+ const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+ const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+ #if __CUDA_ARCH__ >= CC_TURING
+ const int mmq_x = MMQ_X_Q6_K_AMPERE;
+ const int mmq_y = MMQ_Y_Q6_K_AMPERE;
+ const int nwarps = NWARPS_Q6_K_AMPERE;
+
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= MIN_CC_DP4A
+ const int mmq_x = MMQ_X_Q6_K_PASCAL;
+ const int mmq_y = MMQ_Y_Q6_K_PASCAL;
+ const int nwarps = NWARPS_Q6_K_PASCAL;
+
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ #else
+ (void) vec_dot_q6_K_q8_1_mul_mat;
+ assert(false);
+ #endif // __CUDA_ARCH__ >= CC_TURING
+ }
+
  template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
  static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
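The structural change behind the block above: mul_mat_q is now a __device__ __forceinline__ template, and every quantization type gets a thin __global__ wrapper that picks its tile shape with the preprocessor. Since __CUDA_ARCH__ is evaluated per target architecture during device compilation, one wrapper yields the large AMPERE tiles on CC_TURING and newer, the smaller, more-warps PASCAL tiles down to MIN_CC_DP4A, and assert(false) below that; the q4_1, q3_K, q4_K and q6_K wrappers additionally cap registers on pre-Turing with __launch_bounds__(WARP_SIZE*NWARPS_*_PASCAL, 2) so two blocks stay resident per SM. A self-contained toy illustration of the dispatch technique (the numeric thresholds stand in for CC_TURING and MIN_CC_DP4A, whose actual values live in ggml-cuda.cu):

    #include <cassert>
    #include <cstdio>

    // Device-side worker templated on the tile shape.
    template <int mmq_x, int mmq_y, int nwarps>
    static __device__ __forceinline__ void work() {
        if (blockIdx.x == 0 && threadIdx.x == 0 && threadIdx.y == 0) {
            printf("tile %dx%d, %d warps\n", mmq_x, mmq_y, nwarps);
        }
    }

    // One __global__ wrapper, compiled once per architecture; the branch is
    // resolved at compile time, so each binary contains exactly one tile shape.
    static __global__ void work_dispatch() {
    #if __CUDA_ARCH__ >= 750   // stand-in for CC_TURING
        work<64, 128, 4>();
    #elif __CUDA_ARCH__ >= 610 // stand-in for MIN_CC_DP4A
        work<64, 64, 8>();
    #else
        assert(false);         // never launched on such devices
    #endif
    }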
@@ -3942,48 +4343,32 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
  CUDA_CHECK(cudaGetDevice(&id));
  const int compute_capability = g_compute_capabilities[id];

+ int mmq_x, mmq_y, nwarps;
  if (compute_capability >= CC_TURING) {
- const int mmq_x = 64;
- const int mmq_y = 128;
- const int nwarps = 4;
-
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
- if (nrows_x % mmq_y == 0) {
- const bool need_check = false;
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
- load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- } else {
- const bool need_check = true;
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
- load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- }
+ mmq_x = MMQ_X_Q4_0_AMPERE;
+ mmq_y = MMQ_Y_Q4_0_AMPERE;
+ nwarps = NWARPS_Q4_0_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q4_0_PASCAL;
+ mmq_y = MMQ_Y_Q4_0_PASCAL;
+ nwarps = NWARPS_Q4_0_PASCAL;
  } else {
- const int mmq_x = 64;
- const int mmq_y = 64;
- const int nwarps = 4;
-
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
- if (nrows_x % mmq_y == 0) {
- const bool need_check = false;
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
- load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- } else {
- const bool need_check = true;
- mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
- load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- }
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ } else {
+ const bool need_check = true;
+ mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
  }

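On the host side, each ggml_mul_mat_*_q8_1_cuda launcher now reads the same MMQ_*/NWARPS_* macros to size its grid, so launch geometry and device tile shape cannot drift apart. A worked example with hypothetical sizes, using the Q4_0 AMPERE constants (mmq_x = 64, mmq_y = 128, nwarps = 4):

    // For nrows_x = 4096, ncols_y = 512:
    //   block_num_x = (4096 + 128 - 1) / 128 = 32      // tile rows, ceil div
    //   block_num_y = ( 512 +  64 - 1) /  64 =  8      // tile cols, ceil div
    //   block_dims  = (WARP_SIZE, nwarps, 1) = (32, 4, 1), 128 threads/block
    // 4096 % 128 == 0, so need_check = false and the bounds-checked
    // instantiation of the kernel is never launched.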
@@ -3995,49 +4380,32 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
  CUDA_CHECK(cudaGetDevice(&id));
  const int compute_capability = g_compute_capabilities[id];

+ int mmq_x, mmq_y, nwarps;
  if (compute_capability >= CC_TURING) {
- const int mmq_x = 64;
- const int mmq_y = 128;
- const int nwarps = 4;
-
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
- if (nrows_x % mmq_y == 0) {
- const bool need_check = false;
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- } else {
- const bool need_check = true;
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- }
+ mmq_x = MMQ_X_Q4_1_AMPERE;
+ mmq_y = MMQ_Y_Q4_1_AMPERE;
+ nwarps = NWARPS_Q4_1_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q4_1_PASCAL;
+ mmq_y = MMQ_Y_Q4_1_PASCAL;
+ nwarps = NWARPS_Q4_1_PASCAL;
  } else {
- const int mmq_x = 64;
- const int mmq_y = 64;
- const int nwarps = 8;
-
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
- if (nrows_x % mmq_y == 0) {
- const bool need_check = false;
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- } else {
- const bool need_check = true;
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- }
+ GGML_ASSERT(false);
+ }

+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ } else {
+ const bool need_check = true;
+ mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
  }

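Every launcher below keeps the same two-way instantiation at the end: the divisibility test runs once on the host and fixes need_check at compile time, and the flag flows into load_tiles_*<mmq_y, nwarps, need_check>, so the evenly divisible case can skip row clamping in the tile loaders entirely. Condensed form, equivalent to the q4_1 code above:

    // Runtime condition mapped onto a compile-time template argument:
    if (nrows_x % mmq_y == 0) {
        mul_mat_q4_1<false><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    } else {
        mul_mat_q4_1<true><<<block_nums, block_dims, 0, stream>>>
            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }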
@@ -4049,48 +4417,32 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
  CUDA_CHECK(cudaGetDevice(&id));
  const int compute_capability = g_compute_capabilities[id];

+ int mmq_x, mmq_y, nwarps;
  if (compute_capability >= CC_TURING) {
- const int mmq_x = 128;
- const int mmq_y = 64;
- const int nwarps = 4;
-
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
- if (nrows_x % mmq_y == 0) {
- const bool need_check = false;
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- } else {
- const bool need_check = true;
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- }
+ mmq_x = MMQ_X_Q5_0_AMPERE;
+ mmq_y = MMQ_Y_Q5_0_AMPERE;
+ nwarps = NWARPS_Q5_0_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q5_0_PASCAL;
+ mmq_y = MMQ_Y_Q5_0_PASCAL;
+ nwarps = NWARPS_Q5_0_PASCAL;
  } else {
- const int mmq_x = 64;
- const int mmq_y = 64;
- const int nwarps = 8;
-
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
- if (nrows_x % mmq_y == 0) {
- const bool need_check = false;
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- } else {
- const bool need_check = true;
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- }
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ } else {
+ const bool need_check = true;
+ mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
  }

@@ -4102,48 +4454,32 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
  CUDA_CHECK(cudaGetDevice(&id));
  const int compute_capability = g_compute_capabilities[id];

+ int mmq_x, mmq_y, nwarps;
  if (compute_capability >= CC_TURING) {
- const int mmq_x = 128;
- const int mmq_y = 64;
- const int nwarps = 8;
-
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
- if (nrows_x % mmq_y == 0) {
- const bool need_check = false;
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- } else {
- const bool need_check = true;
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- }
+ mmq_x = MMQ_X_Q5_1_AMPERE;
+ mmq_y = MMQ_Y_Q5_1_AMPERE;
+ nwarps = NWARPS_Q5_1_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q5_1_PASCAL;
+ mmq_y = MMQ_Y_Q5_1_PASCAL;
+ nwarps = NWARPS_Q5_1_PASCAL;
  } else {
- const int mmq_x = 64;
- const int mmq_y = 64;
- const int nwarps = 8;
-
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
- if (nrows_x % mmq_y == 0) {
- const bool need_check = false;
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- } else {
- const bool need_check = true;
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- }
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ } else {
+ const bool need_check = true;
+ mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
  }

@@ -4155,48 +4491,32 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
  CUDA_CHECK(cudaGetDevice(&id));
  const int compute_capability = g_compute_capabilities[id];

+ int mmq_x, mmq_y, nwarps;
  if (compute_capability >= CC_TURING) {
- const int mmq_x = 128;
- const int mmq_y = 64;
- const int nwarps = 4;
-
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
- if (nrows_x % mmq_y == 0) {
- const bool need_check = false;
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- } else {
- const bool need_check = true;
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- }
+ mmq_x = MMQ_X_Q8_0_AMPERE;
+ mmq_y = MMQ_Y_Q8_0_AMPERE;
+ nwarps = NWARPS_Q8_0_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q8_0_PASCAL;
+ mmq_y = MMQ_Y_Q8_0_PASCAL;
+ nwarps = NWARPS_Q8_0_PASCAL;
  } else {
- const int mmq_x = 64;
- const int mmq_y = 64;
- const int nwarps = 8;
-
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
- if (nrows_x % mmq_y == 0) {
- const bool need_check = false;
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- } else {
- const bool need_check = true;
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- }
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ } else {
+ const bool need_check = true;
+ mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
  }

@@ -4208,48 +4528,32 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
  CUDA_CHECK(cudaGetDevice(&id));
  const int compute_capability = g_compute_capabilities[id];

+ int mmq_x, mmq_y, nwarps;
  if (compute_capability >= CC_TURING) {
- const int mmq_x = 64;
- const int mmq_y = 128;
- const int nwarps = 4;
-
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
- if (nrows_x % mmq_y == 0) {
- const bool need_check = false;
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- } else {
- const bool need_check = true;
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- }
+ mmq_x = MMQ_X_Q2_K_AMPERE;
+ mmq_y = MMQ_Y_Q2_K_AMPERE;
+ nwarps = NWARPS_Q2_K_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q2_K_PASCAL;
+ mmq_y = MMQ_Y_Q2_K_PASCAL;
+ nwarps = NWARPS_Q2_K_PASCAL;
  } else {
- const int mmq_x = 64;
- const int mmq_y = 64;
- const int nwarps = 8;
-
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
- if (nrows_x % mmq_y == 0) {
- const bool need_check = false;
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- } else {
- const bool need_check = true;
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- }
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ } else {
+ const bool need_check = true;
+ mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
  }

@@ -4261,48 +4565,32 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
  CUDA_CHECK(cudaGetDevice(&id));
  const int compute_capability = g_compute_capabilities[id];

+ int mmq_x, mmq_y, nwarps;
  if (compute_capability >= CC_TURING) {
- const int mmq_x = 128;
- const int mmq_y = 128;
- const int nwarps = 4;
-
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
- if (nrows_x % mmq_y == 0) {
- const bool need_check = false;
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- } else {
- const bool need_check = true;
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- }
+ mmq_x = MMQ_X_Q3_K_AMPERE;
+ mmq_y = MMQ_Y_Q3_K_AMPERE;
+ nwarps = NWARPS_Q3_K_AMPERE;
+ } else if (compute_capability >= MIN_CC_DP4A) {
+ mmq_x = MMQ_X_Q3_K_PASCAL;
+ mmq_y = MMQ_Y_Q3_K_PASCAL;
+ nwarps = NWARPS_Q3_K_PASCAL;
  } else {
- const int mmq_x = 64;
- const int mmq_y = 64;
- const int nwarps = 8;
-
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
- const dim3 block_nums(block_num_x, block_num_y, 1);
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
- if (nrows_x % mmq_y == 0) {
- const bool need_check = false;
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- } else {
- const bool need_check = true;
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
- }
+ GGML_ASSERT(false);
+ }
+
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+ const dim3 block_nums(block_num_x, block_num_y, 1);
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+ if (nrows_x % mmq_y == 0) {
+ const bool need_check = false;
+ mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+ } else {
+ const bool need_check = true;
+ mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
  }

@@ -4314,48 +4602,32 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-        const int mmq_x = 64;
-        const int mmq_y = 128;
-        const int nwarps = 4;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  = MMQ_X_Q4_K_AMPERE;
+        mmq_y  = MMQ_Y_Q4_K_AMPERE;
+        nwarps = NWARPS_Q4_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  = MMQ_X_Q4_K_PASCAL;
+        mmq_y  = MMQ_Y_Q4_K_PASCAL;
+        nwarps = NWARPS_Q4_K_PASCAL;
     } else {
-        const int mmq_x = 32;
-        const int mmq_y = 64;
-        const int nwarps = 8;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
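For reference, the launch geometry that replaces the duplicated per-branch copies: each thread block computes one mmq_y x mmq_x tile of the output, so the grid is the ceiling division of the matrix extents by the tile extents, and each block holds nwarps warps of WARP_SIZE threads. Worked through in plain C with the tile shape the deleted Turing branch hard-coded (mmq_x = 64, mmq_y = 128) and hypothetical matrix extents:

#include <stdio.h>

int main(void) {
    const int nrows_x = 4096, ncols_y = 512; /* hypothetical matrix extents */
    const int mmq_x = 64, mmq_y = 128;       /* old q4_K Turing tile shape  */

    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; /* tiles along rows of x */
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; /* tiles along cols of y */

    printf("grid = %d x %d blocks\n", block_num_x, block_num_y);              /* 32 x 8 */
    printf("need_check = %s\n", nrows_x % mmq_y == 0 ? "false" : "true");     /* false  */
    return 0;
}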
@@ -4367,48 +4639,32 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-        const int mmq_x = 64;
-        const int mmq_y = 128;
-        const int nwarps = 4;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  = MMQ_X_Q5_K_AMPERE;
+        mmq_y  = MMQ_Y_Q5_K_AMPERE;
+        nwarps = NWARPS_Q5_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  = MMQ_X_Q5_K_PASCAL;
+        mmq_y  = MMQ_Y_Q5_K_PASCAL;
+        nwarps = NWARPS_Q5_K_PASCAL;
     } else {
-        const int mmq_x = 64;
-        const int mmq_y = 64;
-        const int nwarps = 8;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4420,48 +4676,32 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-        const int mmq_x = 64;
-        const int mmq_y = 64;
-        const int nwarps = 4;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  = MMQ_X_Q6_K_AMPERE;
+        mmq_y  = MMQ_Y_Q6_K_AMPERE;
+        nwarps = NWARPS_Q6_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  = MMQ_X_Q6_K_PASCAL;
+        mmq_y  = MMQ_Y_Q6_K_PASCAL;
+        nwarps = NWARPS_Q6_K_PASCAL;
     } else {
-        const int mmq_x = 32;
-        const int mmq_y = 64;
-        const int nwarps = 8;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
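All four launchers now share the same capability ladder: Turing and newer devices get the *_AMPERE tile constants, anything back to the first __dp4a-capable architecture gets the *_PASCAL constants, and older devices trip GGML_ASSERT(false), matching the assert(false) fallbacks added to the device-side dot products earlier in this diff. A hedged sketch of that dispatch against a live device; the threshold values below are assumptions about the CC_TURING and MIN_CC_DP4A macros, which are not shown in this diff:

#include <stdio.h>
#include <cuda_runtime.h>

/* Assumed macro values, not taken from this diff. */
#define MIN_CC_DP4A 610  /* first arch with the __dp4a intrinsic (assumed) */
#define CC_TURING   700  /* assumed threshold for the *_AMPERE constants   */

int main(void) {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) return 1;
    /* same 100*major + 10*minor encoding g_compute_capabilities appears to store */
    const int cc = 100 * prop.major + 10 * prop.minor;
    if (cc >= CC_TURING) {
        puts("would use the *_AMPERE tile constants");
    } else if (cc >= MIN_CC_DP4A) {
        puts("would use the *_PASCAL tile constants");
    } else {
        puts("mul_mat_q unsupported: launcher would GGML_ASSERT(false)");
    }
    printf("compute capability: %d\n", cc);
    return 0;
}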
@@ -6229,3 +6469,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     func(tensor->src[0], tensor->src[1], tensor);
     return true;
 }
+
+int ggml_cuda_get_device_count() {
+    int device_count;
+    CUDA_CHECK(cudaGetDeviceCount(&device_count));
+    return device_count;
+}
+
+void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+    snprintf(description, description_size, "%s", prop.name);
+}
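The two functions appended at the end of the file are new device-enumeration entry points that callers (such as the Ruby bindings) can use without touching the CUDA runtime directly. A small host-side usage sketch; the declarations below are copied from the added definitions and stand in for whatever header the real build includes:

#include <stdio.h>
#include <stddef.h>

/* Declarations matching the functions added above; in a real build these
 * would come from the ggml CUDA header rather than being restated here. */
int  ggml_cuda_get_device_count(void);
void ggml_cuda_get_device_description(int device, char * description, size_t description_size);

int main(void) {
    char name[128];
    const int n = ggml_cuda_get_device_count();
    for (int i = 0; i < n; ++i) {
        ggml_cuda_get_device_description(i, name, sizeof(name));
        printf("device %d: %s\n", i, name);
    }
    return 0;
}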