llama_cpp 0.3.7 → 0.3.8

@@ -1399,6 +1399,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
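
This is the first of a long run of identical hardening hunks: every `vec_dot_*` implementation gains an `assert(false)` in the `#else` branch that is compiled for GPUs below `MIN_CC_DP4A`, i.e. without the `__dp4a` integer dot-product intrinsic. Host-side dispatch never launches these kernels on such devices, so the branch should be unreachable; the assert makes a dispatch bug fail loudly in debug builds instead of silently returning zeros. A minimal sketch of the pattern, with hypothetical names and an assumed value for `MIN_CC_DP4A`:

```cuda
#include <cassert>

#define MIN_CC_DP4A 610 // assumed: first compute capability with dp4a (6.1)

// Guarded device function: the #else branch exists only so the translation
// unit still compiles for older architectures; correct host-side dispatch
// makes it unreachable at runtime.
static __device__ __forceinline__ float vec_dot_guarded(int v, int u, float d) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A
    return d * (float) __dp4a(v, u, 0); // SIMD dot product of four int8 pairs
#else
    assert(false); // trap incorrect dispatch in debug builds
    return 0.0f;   // only to satisfy the compiler
#endif
}
```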
@@ -1436,6 +1437,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1471,6 +1473,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1516,6 +1519,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1537,6 +1541,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp

     return d8_0*d8_1 * sumi;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1567,6 +1572,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1602,6 +1608,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(

     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1639,6 +1646,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(

     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1679,6 +1687,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(

     return d3 * sumf;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1704,6 +1713,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(

     return d3*d8 * sumi;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1737,12 +1747,12 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 // contiguous u/y values
-// also used for q5_K
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
     const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
@@ -1752,19 +1762,18 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     float sumf_m = 0.0f;

 #pragma unroll
-    for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) {
+    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
         int sumi_d = 0;

 #pragma unroll
-        for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) {
-            sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product
-            sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
         }

-        const float2 ds8f = __half22float2(ds8[i0 / 4]);
+        const float2 ds8f = __half22float2(ds8[i]);

-        sumf_d += ds8f.x * (sc[i0/4] * sumi_d);
-        sumf_m += ds8f.y * m[i0/4]; // sum of q8_1 block * q4_K min val
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
     }

     const float2 dm4f = __half22float2(dm4);
@@ -1772,6 +1781,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
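
Beyond the assert, the hunks above restructure `vec_dot_q4_K_q8_1_impl_mmq`: instead of consuming a scratch array of pre-extracted nibbles built by the caller (see the matching `vec_dot_q4_K_q8_1_mul_mat` hunk further down, which deletes that unpack loop), the inner loop now takes the packed `x_ql` words directly and splits them in-register with `(v[j] >> (4*i)) & 0x0F0F0F0F`. A standalone illustration of that nibble split (illustrative input word, not kernel data):

```cuda
#include <cstdio>

// One 32-bit word packs eight 4-bit quants. Shifting by 4*i (i = 0 selects the
// low nibbles, i = 1 the high nibbles) and masking with 0x0F0F0F0F leaves four
// bytes that each hold one 4-bit value, ready to feed __dp4a on the device.
int main() {
    const unsigned packed = 0xA1B2C3D4u;
    for (int i = 0; i < 2; ++i) {
        const unsigned quants = (packed >> (4*i)) & 0x0F0F0F0Fu;
        std::printf("i=%d -> 0x%08X\n", i, quants); // 0x01020304, then 0x0A0B0C0D
    }
    return 0;
}
```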
@@ -1780,7 +1790,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
 #define VDR_Q5_K_Q8_1_MMQ 8

 // contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
     const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {

@@ -1812,6 +1822,41 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
     return dm5f.x*sumf_d - dm5f.y*sumf_m;

 #else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i]);
+
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
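
The new `vec_dot_q5_K_q8_1_impl_mmq` mirrors the q4_K version above; the only difference is that its `v` values are used as-is rather than nibble-shifted. Reading the shared structure off the code, both compute

\[
\mathrm{result} \;=\; dm_x \sum_i ds8[i]_x \, sc[i] \, \mathrm{sumi}_i \;-\; dm_y \sum_i ds8[i]_y \, m[i]
\]

where \(\mathrm{sumi}_i\) is the integer dot product accumulated by `__dp4a` over group \(i\), `sc[i]` and `m[i]` are the per-group sub-scales and minima, and `dm` carries the super-block scale and min. Splitting the renamed `vec_dot_q5_K_q8_1_impl_vmmq` from this new mmq variant is what lets the q5_K mul_mat path further down stop reusing the q4_K implementation.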
@@ -1842,6 +1887,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(

     return d*sumf;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1873,6 +1919,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -2722,6 +2769,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

@@ -2808,18 +2856,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
-        v[l + 0]         = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
-        v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
-    }
-
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

     const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
-    return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
+    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
@@ -2866,7 +2907,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
         u[2*i+1] = q8[4];
     }

-    return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
+    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);

 #else

@@ -2905,6 +2946,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

@@ -3008,7 +3050,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(

     const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
     const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
-    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
+    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -3135,7 +3178,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(

 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
           allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
-static __global__ void mul_mat_q(
+static __device__ __forceinline__ void mul_mat_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

@@ -3150,7 +3193,6 @@ static __global__ void mul_mat_q(

     const int row_dst_0 = blockIdx.x*mmq_y;
     const int & row_x_0 = row_dst_0;
-    const int row_dst = row_dst_0 + threadIdx.x;

     const int col_dst_0 = blockIdx.y*mmq_x;
     const int & col_y_0 = col_dst_0;
@@ -3223,11 +3265,7 @@ static __global__ void mul_mat_q(
         }
     }

-
-    if (row_dst >= nrows_dst) {
-        return;
-    }
-
+#pragma unroll
     for (int j = 0; j < mmq_x; j += nwarps) {
         const int col_dst = col_dst_0 + j + threadIdx.y;

@@ -3235,12 +3273,375 @@ static __global__ void mul_mat_q(
             return;
         }

+#pragma unroll
         for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-            dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/nwarps];
+            const int row_dst = row_dst_0 + threadIdx.x + i;
+
+            if (row_dst >= nrows_dst) {
+                continue;
+            }
+
+            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
         }
     }
 }

+#define MMQ_X_Q4_0_AMPERE 64
+#define MMQ_Y_Q4_0_AMPERE 128
+#define NWARPS_Q4_0_AMPERE 4
+#define MMQ_X_Q4_0_PASCAL 64
+#define MMQ_Y_Q4_0_PASCAL 64
+#define NWARPS_Q4_0_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q4_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
+    const int nwarps = NWARPS_Q4_0_AMPERE;
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_0_PASCAL;
+    const int nwarps = NWARPS_Q4_0_PASCAL;
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q4_1_AMPERE 64
+#define MMQ_Y_Q4_1_AMPERE 128
+#define NWARPS_Q4_1_AMPERE 4
+#define MMQ_X_Q4_1_PASCAL 64
+#define MMQ_Y_Q4_1_PASCAL 64
+#define NWARPS_Q4_1_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q4_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
+    const int nwarps = NWARPS_Q4_1_AMPERE;
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_1_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
+    const int nwarps = NWARPS_Q4_1_PASCAL;
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_1_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q5_0_AMPERE 128
+#define MMQ_Y_Q5_0_AMPERE 64
+#define NWARPS_Q5_0_AMPERE 4
+#define MMQ_X_Q5_0_PASCAL 64
+#define MMQ_Y_Q5_0_PASCAL 64
+#define NWARPS_Q5_0_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q5_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
+    const int nwarps = NWARPS_Q5_0_AMPERE;
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_0_PASCAL;
+    const int nwarps = NWARPS_Q5_0_PASCAL;
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q5_1_AMPERE 128
+#define MMQ_Y_Q5_1_AMPERE 64
+#define NWARPS_Q5_1_AMPERE 4
+#define MMQ_X_Q5_1_PASCAL 64
+#define MMQ_Y_Q5_1_PASCAL 64
+#define NWARPS_Q5_1_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q5_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
+    const int nwarps = NWARPS_Q5_1_AMPERE;
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_1_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_1_PASCAL;
+    const int nwarps = NWARPS_Q5_1_PASCAL;
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_1_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q8_0_AMPERE 128
+#define MMQ_Y_Q8_0_AMPERE 64
+#define NWARPS_Q8_0_AMPERE 4
+#define MMQ_X_Q8_0_PASCAL 64
+#define MMQ_Y_Q8_0_PASCAL 64
+#define NWARPS_Q8_0_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q8_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
+    const int nwarps = NWARPS_Q8_0_AMPERE;
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q8_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q8_0_PASCAL;
+    const int nwarps = NWARPS_Q8_0_PASCAL;
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q8_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q2_K_AMPERE 64
+#define MMQ_Y_Q2_K_AMPERE 128
+#define NWARPS_Q2_K_AMPERE 4
+#define MMQ_X_Q2_K_PASCAL 64
+#define MMQ_Y_Q2_K_PASCAL 64
+#define NWARPS_Q2_K_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q2_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
+    const int nwarps = NWARPS_Q2_K_AMPERE;
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q2_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q2_K_PASCAL;
+    const int nwarps = NWARPS_Q2_K_PASCAL;
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q2_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q3_K_AMPERE 128
+#define MMQ_Y_Q3_K_AMPERE 128
+#define NWARPS_Q3_K_AMPERE 4
+#define MMQ_X_Q3_K_PASCAL 64
+#define MMQ_Y_Q3_K_PASCAL 64
+#define NWARPS_Q3_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q3_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
+    const int nwarps = NWARPS_Q3_K_AMPERE;
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q3_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q3_K_PASCAL;
+    const int nwarps = NWARPS_Q3_K_PASCAL;
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q3_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q4_K_AMPERE 64
+#define MMQ_Y_Q4_K_AMPERE 128
+#define NWARPS_Q4_K_AMPERE 4
+#define MMQ_X_Q4_K_PASCAL 64
+#define MMQ_Y_Q4_K_PASCAL 64
+#define NWARPS_Q4_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q4_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
+    const int nwarps = NWARPS_Q4_K_AMPERE;
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_K_PASCAL;
+    const int nwarps = NWARPS_Q4_K_PASCAL;
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q5_K_AMPERE 64
+#define MMQ_Y_Q5_K_AMPERE 128
+#define NWARPS_Q5_K_AMPERE 4
+#define MMQ_X_Q5_K_PASCAL 64
+#define MMQ_Y_Q5_K_PASCAL 64
+#define NWARPS_Q5_K_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q5_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
+    const int nwarps = NWARPS_Q5_K_AMPERE;
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_K_PASCAL;
+    const int nwarps = NWARPS_Q5_K_PASCAL;
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q6_K_AMPERE 64
+#define MMQ_Y_Q6_K_AMPERE 64
+#define NWARPS_Q6_K_AMPERE 4
+#define MMQ_X_Q6_K_PASCAL 64
+#define MMQ_Y_Q6_K_PASCAL 64
+#define NWARPS_Q6_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q6_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
+    const int nwarps = NWARPS_Q6_K_AMPERE;
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q6_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q6_K_PASCAL;
+    const int nwarps = NWARPS_Q6_K_PASCAL;
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q6_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
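
This hunk is the heart of the refactor: `mul_mat_q` is demoted from a `__global__` kernel to an inlined `__device__` function, and each quantization type gets a thin `__global__` wrapper (`mul_mat_q4_0` through `mul_mat_q6_K`) that binds the new `MMQ_X_*`/`MMQ_Y_*`/`NWARPS_*` macros as compile-time tile constants, with the "Ampere" or "Pascal" variant selected by `__CUDA_ARCH__`. Some wrappers additionally apply `__launch_bounds__(WARP_SIZE*NWARPS_*_PASCAL, 2)` on pre-Turing builds to cap register usage. A simplified sketch of the pattern, with hypothetical names and illustrative architecture thresholds:

```cuda
#include <cassert>

// The tiled body is a __device__ function whose tile shape is a template
// parameter; a per-type __global__ wrapper picks the shape that matches the
// architecture the current device-code pass is compiling for.
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void mul_mat_body(const void * vx, float * dst) {
    // ... tiled multiply; mmq_x/mmq_y/nwarps are compile-time constants ...
}

template <bool need_check> static __global__ void mul_mat_example(const void * vx, float * dst) {
#if __CUDA_ARCH__ >= 700              // illustrative "newer arch" threshold
    mul_mat_body<64, 128, 4>(vx, dst); // bigger tiles, fewer warps
#elif __CUDA_ARCH__ >= 610            // illustrative DP4A threshold
    mul_mat_body<64, 64, 8>(vx, dst);  // smaller tiles for older GPUs
#else
    assert(false);                    // host never dispatches here
#endif
}
```

The host launcher must pick the same tile constants from its runtime `compute_capability` so the launch geometry matches whichever branch the device code compiled; that is exactly what the rewritten `ggml_mul_mat_*_q8_1_cuda` functions below do.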
@@ -3942,48 +4343,32 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-        const int mmq_x = 64;
-        const int mmq_y = 128;
-        const int nwarps = 4;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q4_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_0_AMPERE;
+        nwarps = NWARPS_Q4_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_0_PASCAL;
+        nwarps = NWARPS_Q4_0_PASCAL;
     } else {
-        const int mmq_x = 64;
-        const int mmq_y = 64;
-        const int nwarps = 4;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -3995,49 +4380,32 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
3995
4380
  CUDA_CHECK(cudaGetDevice(&id));
3996
4381
  const int compute_capability = g_compute_capabilities[id];
3997
4382
 
4383
+ int mmq_x, mmq_y, nwarps;
3998
4384
  if (compute_capability >= CC_TURING) {
3999
- const int mmq_x = 64;
4000
- const int mmq_y = 128;
4001
- const int nwarps = 4;
4002
-
4003
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4004
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4005
- const dim3 block_nums(block_num_x, block_num_y, 1);
4006
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4007
-
4008
- if (nrows_x % mmq_y == 0) {
4009
- const bool need_check = false;
4010
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4011
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4012
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4013
- } else {
4014
- const bool need_check = true;
4015
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4016
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4017
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4018
- }
4385
+ mmq_x = MMQ_X_Q4_1_AMPERE;
4386
+ mmq_y = MMQ_Y_Q4_1_AMPERE;
4387
+ nwarps = NWARPS_Q4_1_AMPERE;
4388
+ } else if (compute_capability >= MIN_CC_DP4A) {
4389
+ mmq_x = MMQ_X_Q4_1_PASCAL;
4390
+ mmq_y = MMQ_Y_Q4_1_PASCAL;
4391
+ nwarps = NWARPS_Q4_1_PASCAL;
4019
4392
  } else {
4020
- const int mmq_x = 64;
4021
- const int mmq_y = 64;
4022
- const int nwarps = 8;
4023
-
4024
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4025
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4026
- const dim3 block_nums(block_num_x, block_num_y, 1);
4027
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4028
-
4029
- if (nrows_x % mmq_y == 0) {
4030
- const bool need_check = false;
4031
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4032
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4033
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4034
- } else {
4035
- const bool need_check = true;
4036
- mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
4037
- load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
4038
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4039
- }
4393
+ GGML_ASSERT(false);
4394
+ }
4040
4395
 
4396
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4397
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4398
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4399
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4400
+
4401
+ if (nrows_x % mmq_y == 0) {
4402
+ const bool need_check = false;
4403
+ mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
4404
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4405
+ } else {
4406
+ const bool need_check = true;
4407
+ mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
4408
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4041
4409
  }
4042
4410
  }
4043
4411
 
@@ -4049,48 +4417,32 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
4049
4417
  CUDA_CHECK(cudaGetDevice(&id));
4050
4418
  const int compute_capability = g_compute_capabilities[id];
4051
4419
 
4420
+ int mmq_x, mmq_y, nwarps;
4052
4421
  if (compute_capability >= CC_TURING) {
4053
- const int mmq_x = 128;
4054
- const int mmq_y = 64;
4055
- const int nwarps = 4;
4056
-
4057
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4058
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4059
- const dim3 block_nums(block_num_x, block_num_y, 1);
4060
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4061
-
4062
- if (nrows_x % mmq_y == 0) {
4063
- const bool need_check = false;
4064
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4065
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4066
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4067
- } else {
4068
- const bool need_check = true;
4069
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4070
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4071
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4072
- }
4422
+ mmq_x = MMQ_X_Q5_0_AMPERE;
4423
+ mmq_y = MMQ_Y_Q5_0_AMPERE;
4424
+ nwarps = NWARPS_Q5_0_AMPERE;
4425
+ } else if (compute_capability >= MIN_CC_DP4A) {
4426
+ mmq_x = MMQ_X_Q5_0_PASCAL;
4427
+ mmq_y = MMQ_Y_Q5_0_PASCAL;
4428
+ nwarps = NWARPS_Q5_0_PASCAL;
4073
4429
  } else {
4074
- const int mmq_x = 64;
4075
- const int mmq_y = 64;
4076
- const int nwarps = 8;
4077
-
4078
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4079
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4080
- const dim3 block_nums(block_num_x, block_num_y, 1);
4081
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4082
-
4083
- if (nrows_x % mmq_y == 0) {
4084
- const bool need_check = false;
4085
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4086
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4087
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4088
- } else {
4089
- const bool need_check = true;
4090
- mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
4091
- load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
4092
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4093
- }
4430
+ GGML_ASSERT(false);
4431
+ }
4432
+
4433
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4434
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4435
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4436
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4437
+
4438
+ if (nrows_x % mmq_y == 0) {
4439
+ const bool need_check = false;
4440
+ mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
4441
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4442
+ } else {
4443
+ const bool need_check = true;
4444
+ mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
4445
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4094
4446
  }
4095
4447
  }
4096
4448
 
@@ -4102,48 +4454,32 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
4102
4454
  CUDA_CHECK(cudaGetDevice(&id));
4103
4455
  const int compute_capability = g_compute_capabilities[id];
4104
4456
 
4457
+ int mmq_x, mmq_y, nwarps;
4105
4458
  if (compute_capability >= CC_TURING) {
4106
- const int mmq_x = 128;
4107
- const int mmq_y = 64;
4108
- const int nwarps = 8;
4109
-
4110
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4111
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4112
- const dim3 block_nums(block_num_x, block_num_y, 1);
4113
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4114
-
4115
- if (nrows_x % mmq_y == 0) {
4116
- const bool need_check = false;
4117
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4118
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4119
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4120
- } else {
4121
- const bool need_check = true;
4122
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4123
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4124
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4125
- }
4459
+ mmq_x = MMQ_X_Q5_1_AMPERE;
4460
+ mmq_y = MMQ_Y_Q5_1_AMPERE;
4461
+ nwarps = NWARPS_Q5_1_AMPERE;
4462
+ } else if (compute_capability >= MIN_CC_DP4A) {
4463
+ mmq_x = MMQ_X_Q5_1_PASCAL;
4464
+ mmq_y = MMQ_Y_Q5_1_PASCAL;
4465
+ nwarps = NWARPS_Q5_1_PASCAL;
4126
4466
  } else {
4127
- const int mmq_x = 64;
4128
- const int mmq_y = 64;
4129
- const int nwarps = 8;
4130
-
4131
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4132
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4133
- const dim3 block_nums(block_num_x, block_num_y, 1);
4134
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4135
-
4136
- if (nrows_x % mmq_y == 0) {
4137
- const bool need_check = false;
4138
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4139
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4140
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4141
- } else {
4142
- const bool need_check = true;
4143
- mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
4144
- load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
4145
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4146
- }
4467
+ GGML_ASSERT(false);
4468
+ }
4469
+
4470
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4471
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4472
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4473
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4474
+
4475
+ if (nrows_x % mmq_y == 0) {
4476
+ const bool need_check = false;
4477
+ mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
4478
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4479
+ } else {
4480
+ const bool need_check = true;
4481
+ mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
4482
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4147
4483
  }
4148
4484
  }
4149
4485
 
@@ -4155,48 +4491,32 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
4155
4491
  CUDA_CHECK(cudaGetDevice(&id));
4156
4492
  const int compute_capability = g_compute_capabilities[id];
4157
4493
 
4494
+ int mmq_x, mmq_y, nwarps;
4158
4495
  if (compute_capability >= CC_TURING) {
4159
- const int mmq_x = 128;
4160
- const int mmq_y = 64;
4161
- const int nwarps = 4;
4162
-
4163
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4164
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4165
- const dim3 block_nums(block_num_x, block_num_y, 1);
4166
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4167
-
4168
- if (nrows_x % mmq_y == 0) {
4169
- const bool need_check = false;
4170
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4171
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4172
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4173
- } else {
4174
- const bool need_check = true;
4175
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4176
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4177
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4178
- }
4496
+ mmq_x = MMQ_X_Q8_0_AMPERE;
4497
+ mmq_y = MMQ_Y_Q8_0_AMPERE;
4498
+ nwarps = NWARPS_Q8_0_AMPERE;
4499
+ } else if (compute_capability >= MIN_CC_DP4A) {
4500
+ mmq_x = MMQ_X_Q8_0_PASCAL;
4501
+ mmq_y = MMQ_Y_Q8_0_PASCAL;
4502
+ nwarps = NWARPS_Q8_0_PASCAL;
4179
4503
  } else {
4180
- const int mmq_x = 64;
4181
- const int mmq_y = 64;
4182
- const int nwarps = 8;
4183
-
4184
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4185
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4186
- const dim3 block_nums(block_num_x, block_num_y, 1);
4187
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4188
-
4189
- if (nrows_x % mmq_y == 0) {
4190
- const bool need_check = false;
4191
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4192
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4193
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4194
- } else {
4195
- const bool need_check = true;
4196
- mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
4197
- load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
4198
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4199
- }
4504
+ GGML_ASSERT(false);
4505
+ }
4506
+
4507
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4508
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4509
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4510
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4511
+
4512
+ if (nrows_x % mmq_y == 0) {
4513
+ const bool need_check = false;
4514
+ mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
4515
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4516
+ } else {
4517
+ const bool need_check = true;
4518
+ mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
4519
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4200
4520
  }
4201
4521
  }
4202
4522
 
@@ -4208,48 +4528,32 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
4208
4528
  CUDA_CHECK(cudaGetDevice(&id));
4209
4529
  const int compute_capability = g_compute_capabilities[id];
4210
4530
 
4531
+ int mmq_x, mmq_y, nwarps;
4211
4532
  if (compute_capability >= CC_TURING) {
4212
- const int mmq_x = 64;
4213
- const int mmq_y = 128;
4214
- const int nwarps = 4;
4215
-
4216
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4217
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4218
- const dim3 block_nums(block_num_x, block_num_y, 1);
4219
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4220
-
4221
- if (nrows_x % mmq_y == 0) {
4222
- const bool need_check = false;
4223
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4224
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4225
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4226
- } else {
4227
- const bool need_check = true;
4228
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4229
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4230
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4231
- }
4533
+ mmq_x = MMQ_X_Q2_K_AMPERE;
4534
+ mmq_y = MMQ_Y_Q2_K_AMPERE;
4535
+ nwarps = NWARPS_Q2_K_AMPERE;
4536
+ } else if (compute_capability >= MIN_CC_DP4A) {
4537
+ mmq_x = MMQ_X_Q2_K_PASCAL;
4538
+ mmq_y = MMQ_Y_Q2_K_PASCAL;
4539
+ nwarps = NWARPS_Q2_K_PASCAL;
4232
4540
  } else {
4233
- const int mmq_x = 64;
4234
- const int mmq_y = 64;
4235
- const int nwarps = 8;
4236
-
4237
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4238
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4239
- const dim3 block_nums(block_num_x, block_num_y, 1);
4240
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4241
-
4242
- if (nrows_x % mmq_y == 0) {
4243
- const bool need_check = false;
4244
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4245
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4246
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4247
- } else {
4248
- const bool need_check = true;
4249
- mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
4250
- load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
4251
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4252
- }
4541
+ GGML_ASSERT(false);
4542
+ }
4543
+
4544
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4545
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4546
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4547
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4548
+
4549
+ if (nrows_x % mmq_y == 0) {
4550
+ const bool need_check = false;
4551
+ mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
4552
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4553
+ } else {
4554
+ const bool need_check = true;
4555
+ mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
4556
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4253
4557
  }
4254
4558
  }
4255
4559
 
@@ -4261,48 +4565,32 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
4261
4565
  CUDA_CHECK(cudaGetDevice(&id));
4262
4566
  const int compute_capability = g_compute_capabilities[id];
4263
4567
 
4568
+ int mmq_x, mmq_y, nwarps;
4264
4569
  if (compute_capability >= CC_TURING) {
4265
- const int mmq_x = 128;
4266
- const int mmq_y = 128;
4267
- const int nwarps = 4;
4268
-
4269
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4270
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4271
- const dim3 block_nums(block_num_x, block_num_y, 1);
4272
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4273
-
4274
- if (nrows_x % mmq_y == 0) {
4275
- const bool need_check = false;
4276
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4277
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4278
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4279
- } else {
4280
- const bool need_check = true;
4281
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4282
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4283
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4284
- }
4570
+ mmq_x = MMQ_X_Q3_K_AMPERE;
4571
+ mmq_y = MMQ_Y_Q3_K_AMPERE;
4572
+ nwarps = NWARPS_Q3_K_AMPERE;
4573
+ } else if (compute_capability >= MIN_CC_DP4A) {
4574
+ mmq_x = MMQ_X_Q3_K_PASCAL;
4575
+ mmq_y = MMQ_Y_Q3_K_PASCAL;
4576
+ nwarps = NWARPS_Q3_K_PASCAL;
4285
4577
  } else {
4286
- const int mmq_x = 64;
4287
- const int mmq_y = 64;
4288
- const int nwarps = 8;
4289
-
4290
- const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4291
- const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4292
- const dim3 block_nums(block_num_x, block_num_y, 1);
4293
- const dim3 block_dims(WARP_SIZE, nwarps, 1);
4294
-
4295
- if (nrows_x % mmq_y == 0) {
4296
- const bool need_check = false;
4297
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4298
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4299
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4300
- } else {
4301
- const bool need_check = true;
4302
- mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
4303
- load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
4304
- <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4305
- }
4578
+ GGML_ASSERT(false);
4579
+ }
4580
+
4581
+ const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
4582
+ const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
4583
+ const dim3 block_nums(block_num_x, block_num_y, 1);
4584
+ const dim3 block_dims(WARP_SIZE, nwarps, 1);
4585
+
4586
+ if (nrows_x % mmq_y == 0) {
4587
+ const bool need_check = false;
4588
+ mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
4589
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4590
+ } else {
4591
+ const bool need_check = true;
4592
+ mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
4593
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4306
4594
  }
4307
4595
  }
4308
4596
 
@@ -4314,48 +4602,32 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-        const int mmq_x = 64;
-        const int mmq_y = 128;
-        const int nwarps = 4;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q4_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_K_AMPERE;
+        nwarps = NWARPS_Q4_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_K_PASCAL;
+        nwarps = NWARPS_Q4_K_PASCAL;
     } else {
-        const int mmq_x = 32;
-        const int mmq_y = 64;
-        const int nwarps = 8;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
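Note: this hunk swaps the hard-coded tile sizes for per-architecture constants (MMQ_X_Q4_K_AMPERE, MMQ_X_Q4_K_PASCAL, and friends, defined elsewhere in the file) chosen at runtime from the device's compute capability. A hedged sketch of the selection pattern follows; the constant values and numeric cutoffs below are assumptions for illustration, not the package's definitions:

    // Sketch only: stand-in tile widths; the real code uses MMQ_X_Q4_K_*.
    #define DEMO_MMQ_X_AMPERE 64
    #define DEMO_MMQ_X_PASCAL 32

    static int pick_mmq_x(const int compute_capability) {
        if (compute_capability >= 700) {   // CC_TURING-style cutoff (assumed value)
            return DEMO_MMQ_X_AMPERE;
        }
        if (compute_capability >= 610) {   // MIN_CC_DP4A-style cutoff (assumed value)
            return DEMO_MMQ_X_PASCAL;
        }
        return -1;  // no DP4A support: the real launcher aborts via GGML_ASSERT(false)
    }
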
@@ -4367,48 +4639,32 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-        const int mmq_x = 64;
-        const int mmq_y = 128;
-        const int nwarps = 4;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q5_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_K_AMPERE;
+        nwarps = NWARPS_Q5_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q5_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_K_PASCAL;
+        nwarps = NWARPS_Q5_K_PASCAL;
     } else {
-        const int mmq_x = 64;
-        const int mmq_y = 64;
-        const int nwarps = 8;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
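Note: the same rewrite recurs for each quantization type. The launch site no longer spells out the full mul_mat_q<...> template argument list; a named kernel such as mul_mat_q5_K bakes in every compile-time choice except need_check. A toy sketch of that wrapper shape, with all names and the trivial body invented for illustration:

    // Toy illustration of the wrapper pattern; not the package's kernels.
    template <int tile_rows, bool need_check>
    static __device__ void demo_worker(const float * x, float * y, const int nrows) {
        const int row = blockIdx.x*tile_rows + threadIdx.x;
        if (need_check && row >= nrows) {
            return; // guard the partial trailing tile only when asked to
        }
        y[row] = x[row];
    }

    // The named wrapper fixes the tile size, so launch sites shrink to
    // demo_q5_like<need_check><<<grid, block>>>(...).
    template <bool need_check>
    static __global__ void demo_q5_like(const float * x, float * y, const int nrows) {
        demo_worker<128, need_check>(x, y, nrows);
    }
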
@@ -4420,48 +4676,32 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-        const int mmq_x = 64;
-        const int mmq_y = 64;
-        const int nwarps = 4;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q6_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q6_K_AMPERE;
+        nwarps = NWARPS_Q6_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q6_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q6_K_PASCAL;
+        nwarps = NWARPS_Q6_K_PASCAL;
     } else {
-        const int mmq_x = 32;
-        const int mmq_y = 64;
-        const int nwarps = 8;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
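Note: need_check is a template parameter rather than a runtime argument, so the bounds test is eliminated at compile time in the fast instantiation; both variants are compiled and the host picks one per launch. A small self-contained illustration (the kernel and all names are made up):

    // Illustration: `if (need_check && ...)` folds away when the template
    // argument is false, so the divisible case pays no per-thread branch.
    template <bool need_check>
    static __global__ void demo_scale(const float * x, float * y, const int n) {
        const int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (need_check && i >= n) {
            return;
        }
        y[i] = 2.0f*x[i];
    }

    // Host-side selection mirroring the launchers above:
    //   n % 256 == 0 ? demo_scale<false><<<n/256,         256>>>(x, y, n)
    //                : demo_scale<true ><<<(n + 255)/256, 256>>>(x, y, n);
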
@@ -6229,3 +6469,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     func(tensor->src[0], tensor->src[1], tensor);
     return true;
 }
+
+int ggml_cuda_get_device_count() {
+    int device_count;
+    CUDA_CHECK(cudaGetDeviceCount(&device_count));
+    return device_count;
+}
+
+void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+    snprintf(description, description_size, "%s", prop.name);
+}
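
Note: the two functions appended at the end of the file expose device enumeration outside the CUDA backend. A minimal caller-side sketch, assuming the declarations are exported from the backend's header (the include name and the 256-byte buffer are assumptions for the example):

    #include <cstdio>
    #include "ggml-cuda.h"   // assumed to declare the two helpers above

    static void list_cuda_devices(void) {
        const int count = ggml_cuda_get_device_count();
        for (int i = 0; i < count; ++i) {
            char desc[256];
            ggml_cuda_get_device_description(i, desc, sizeof(desc));
            printf("CUDA device %d: %s\n", i, desc);
        }
    }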