llama_cpp 0.3.7 → 0.3.8
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +8 -0
- data/ext/llama_cpp/src/ggml-alloc.c +36 -6
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +680 -428
- data/ext/llama_cpp/src/ggml-cuda.h +19 -23
- data/ext/llama_cpp/src/ggml-metal.h +6 -3
- data/ext/llama_cpp/src/ggml-metal.m +73 -128
- data/ext/llama_cpp/src/ggml-metal.metal +471 -498
- data/ext/llama_cpp/src/llama-util.h +10 -8
- data/ext/llama_cpp/src/llama.cpp +176 -64
- data/ext/llama_cpp/src/llama.h +3 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +1 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cu:

@@ -1399,6 +1399,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1436,6 +1437,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1471,6 +1473,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1516,6 +1519,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1537,6 +1541,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp

     return d8_0*d8_1 * sumi;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1567,6 +1572,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
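All of the fallback branches above sit behind the MIN_CC_DP4A guard, and this release adds assert(false) so that hitting the fallback fails loudly instead of silently returning 0.0f. As a rough, self-contained illustration of what the guarded __dp4a "SIMD dot product" computes (the values and file name below are made up, not from the gem):

#include <cassert>
#include <cstdio>
#include <cuda_runtime.h>

// Minimal sketch: __dp4a treats each int as four packed int8 lanes and does a
// 4-way multiply-accumulate, which is the SIMD dot product the vec_dot_* code
// above relies on.
__global__ void dp4a_demo(int * out) {
#if __CUDA_ARCH__ >= 610                    // same idea as the MIN_CC_DP4A guard
    const int v = 0x01020304;               // lanes 4, 3, 2, 1 (lowest byte first)
    const int u = 0x01010101;               // lanes 1, 1, 1, 1
    *out = __dp4a(v, u, 0);                 // 4*1 + 3*1 + 2*1 + 1*1 = 10
#else
    assert(false);                          // no DP4A below compute capability 6.1
    *out = 0;                               // only to satisfy the compiler
#endif
}

int main() {
    int * d_out = nullptr;
    int h_out = 0;
    cudaMalloc(&d_out, sizeof(int));
    dp4a_demo<<<1, 1>>>(d_out);
    cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("dp4a result: %d\n", h_out);     // expected: 10
    cudaFree(d_out);
    return 0;
}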
@@ -1602,6 +1608,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(

     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1639,6 +1646,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(

     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1679,6 +1687,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(

     return d3 * sumf;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1704,6 +1713,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(

     return d3*d8 * sumi;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1737,12 +1747,12 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 // contiguous u/y values
-// also used for q5_K
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
     const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
@@ -1752,19 +1762,18 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     float sumf_m = 0.0f;

 #pragma unroll
-    for (int
+    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
         int sumi_d = 0;

 #pragma unroll
-        for (int
-            sumi_d = __dp4a(v[
-            sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
         }

-        const float2 ds8f = __half22float2(ds8[
+        const float2 ds8f = __half22float2(ds8[i]);

-        sumf_d += ds8f.x * (sc[
-        sumf_m += ds8f.y * m[
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
     }

     const float2 dm4f = __half22float2(dm4);
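The rewritten loop above now unpacks the q4_K nibbles on the fly: (v[j] >> (4*i)) & 0x0F0F0F0F extracts either the low (i = 0) or the high (i = 1) 4-bit quant out of every byte of a packed 32-bit word before feeding it to __dp4a. A small host-side sketch of that mask arithmetic, using a made-up example word:

#include <cstdio>

int main() {
    const unsigned v  = 0xA1B2C3D4;                 // example packed word, two 4-bit quants per byte
    const unsigned lo = (v >> 0) & 0x0F0F0F0F;      // low nibbles  -> 0x01020304
    const unsigned hi = (v >> 4) & 0x0F0F0F0F;      // high nibbles -> 0x0A0B0C0D
    printf("lo = %08X, hi = %08X\n", lo, hi);
    return 0;
}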
@@ -1772,6 +1781,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1780,7 +1790,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
 #define VDR_Q5_K_Q8_1_MMQ 8

 // contiguous v/x values
-static __device__ __forceinline__ float
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
     const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {

@@ -1812,6 +1822,41 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
     return dm5f.x*sumf_d - dm5f.y*sumf_m;

 #else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i]);
+
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1842,6 +1887,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(

     return d*sumf;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1873,6 +1919,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -2722,6 +2769,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

@@ -2808,18 +2856,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
-        v[l + 0] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
-        v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
-    }
-
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

     const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
-    return vec_dot_q4_K_q8_1_impl_mmq(
+    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
@@ -2866,7 +2907,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
         u[2*i+1] = q8[4];
     }

-    return
+    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);

 #else

@@ -2905,6 +2946,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

@@ -3008,7 +3050,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(

     const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
     const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
-    return
+    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -3135,7 +3178,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(

 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
           allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
-static
+static __device__ __forceinline__ void mul_mat_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

@@ -3150,7 +3193,6 @@ static __global__ void mul_mat_q(

     const int row_dst_0 = blockIdx.x*mmq_y;
     const int & row_x_0 = row_dst_0;
-    const int row_dst = row_dst_0 + threadIdx.x;

     const int col_dst_0 = blockIdx.y*mmq_x;
     const int & col_y_0 = col_dst_0;
@@ -3223,11 +3265,7 @@ static __global__ void mul_mat_q(
         }
     }

-
-    if (row_dst >= nrows_dst) {
-        return;
-    }
-
+#pragma unroll
     for (int j = 0; j < mmq_x; j += nwarps) {
         const int col_dst = col_dst_0 + j + threadIdx.y;

@@ -3235,12 +3273,375 @@ static __global__ void mul_mat_q(
             return;
         }

+#pragma unroll
         for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-
+            const int row_dst = row_dst_0 + threadIdx.x + i;
+
+            if (row_dst >= nrows_dst) {
+                continue;
+            }
+
+            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
         }
     }
 }

+#define MMQ_X_Q4_0_AMPERE 64
+#define MMQ_Y_Q4_0_AMPERE 128
+#define NWARPS_Q4_0_AMPERE 4
+#define MMQ_X_Q4_0_PASCAL 64
+#define MMQ_Y_Q4_0_PASCAL 64
+#define NWARPS_Q4_0_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q4_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q4_0_AMPERE;
+    const int mmq_y  = MMQ_Y_Q4_0_AMPERE;
+    const int nwarps = NWARPS_Q4_0_AMPERE;
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q4_0_PASCAL;
+    const int mmq_y  = MMQ_Y_Q4_0_PASCAL;
+    const int nwarps = NWARPS_Q4_0_PASCAL;
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q4_1_AMPERE 64
+#define MMQ_Y_Q4_1_AMPERE 128
+#define NWARPS_Q4_1_AMPERE 4
+#define MMQ_X_Q4_1_PASCAL 64
+#define MMQ_Y_Q4_1_PASCAL 64
+#define NWARPS_Q4_1_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q4_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q4_1_AMPERE;
+    const int mmq_y  = MMQ_Y_Q4_1_AMPERE;
+    const int nwarps = NWARPS_Q4_1_AMPERE;
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q4_1_PASCAL;
+    const int mmq_y  = MMQ_Y_Q4_1_PASCAL;
+    const int nwarps = NWARPS_Q4_1_PASCAL;
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_1_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q5_0_AMPERE 128
+#define MMQ_Y_Q5_0_AMPERE 64
+#define NWARPS_Q5_0_AMPERE 4
+#define MMQ_X_Q5_0_PASCAL 64
+#define MMQ_Y_Q5_0_PASCAL 64
+#define NWARPS_Q5_0_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q5_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q5_0_AMPERE;
+    const int mmq_y  = MMQ_Y_Q5_0_AMPERE;
+    const int nwarps = NWARPS_Q5_0_AMPERE;
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q5_0_PASCAL;
+    const int mmq_y  = MMQ_Y_Q5_0_PASCAL;
+    const int nwarps = NWARPS_Q5_0_PASCAL;
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q5_1_AMPERE 128
+#define MMQ_Y_Q5_1_AMPERE 64
+#define NWARPS_Q5_1_AMPERE 4
+#define MMQ_X_Q5_1_PASCAL 64
+#define MMQ_Y_Q5_1_PASCAL 64
+#define NWARPS_Q5_1_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q5_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q5_1_AMPERE;
+    const int mmq_y  = MMQ_Y_Q5_1_AMPERE;
+    const int nwarps = NWARPS_Q5_1_AMPERE;
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q5_1_PASCAL;
+    const int mmq_y  = MMQ_Y_Q5_1_PASCAL;
+    const int nwarps = NWARPS_Q5_1_PASCAL;
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_1_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q8_0_AMPERE 128
+#define MMQ_Y_Q8_0_AMPERE 64
+#define NWARPS_Q8_0_AMPERE 4
+#define MMQ_X_Q8_0_PASCAL 64
+#define MMQ_Y_Q8_0_PASCAL 64
+#define NWARPS_Q8_0_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q8_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q8_0_AMPERE;
+    const int mmq_y  = MMQ_Y_Q8_0_AMPERE;
+    const int nwarps = NWARPS_Q8_0_AMPERE;
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q8_0_PASCAL;
+    const int mmq_y  = MMQ_Y_Q8_0_PASCAL;
+    const int nwarps = NWARPS_Q8_0_PASCAL;
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q8_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q2_K_AMPERE 64
+#define MMQ_Y_Q2_K_AMPERE 128
+#define NWARPS_Q2_K_AMPERE 4
+#define MMQ_X_Q2_K_PASCAL 64
+#define MMQ_Y_Q2_K_PASCAL 64
+#define NWARPS_Q2_K_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q2_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q2_K_AMPERE;
+    const int mmq_y  = MMQ_Y_Q2_K_AMPERE;
+    const int nwarps = NWARPS_Q2_K_AMPERE;
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q2_K_PASCAL;
+    const int mmq_y  = MMQ_Y_Q2_K_PASCAL;
+    const int nwarps = NWARPS_Q2_K_PASCAL;
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q2_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q3_K_AMPERE 128
+#define MMQ_Y_Q3_K_AMPERE 128
+#define NWARPS_Q3_K_AMPERE 4
+#define MMQ_X_Q3_K_PASCAL 64
+#define MMQ_Y_Q3_K_PASCAL 64
+#define NWARPS_Q3_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q3_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q3_K_AMPERE;
+    const int mmq_y  = MMQ_Y_Q3_K_AMPERE;
+    const int nwarps = NWARPS_Q3_K_AMPERE;
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q3_K_PASCAL;
+    const int mmq_y  = MMQ_Y_Q3_K_PASCAL;
+    const int nwarps = NWARPS_Q3_K_PASCAL;
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q3_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q4_K_AMPERE 64
+#define MMQ_Y_Q4_K_AMPERE 128
+#define NWARPS_Q4_K_AMPERE 4
+#define MMQ_X_Q4_K_PASCAL 64
+#define MMQ_Y_Q4_K_PASCAL 64
+#define NWARPS_Q4_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q4_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q4_K_AMPERE;
+    const int mmq_y  = MMQ_Y_Q4_K_AMPERE;
+    const int nwarps = NWARPS_Q4_K_AMPERE;
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q4_K_PASCAL;
+    const int mmq_y  = MMQ_Y_Q4_K_PASCAL;
+    const int nwarps = NWARPS_Q4_K_PASCAL;
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q5_K_AMPERE 64
+#define MMQ_Y_Q5_K_AMPERE 128
+#define NWARPS_Q5_K_AMPERE 4
+#define MMQ_X_Q5_K_PASCAL 64
+#define MMQ_Y_Q5_K_PASCAL 64
+#define NWARPS_Q5_K_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q5_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q5_K_AMPERE;
+    const int mmq_y  = MMQ_Y_Q5_K_AMPERE;
+    const int nwarps = NWARPS_Q5_K_AMPERE;
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q5_K_PASCAL;
+    const int mmq_y  = MMQ_Y_Q5_K_PASCAL;
+    const int nwarps = NWARPS_Q5_K_PASCAL;
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q6_K_AMPERE 64
+#define MMQ_Y_Q6_K_AMPERE 64
+#define NWARPS_Q6_K_AMPERE 4
+#define MMQ_X_Q6_K_PASCAL 64
+#define MMQ_Y_Q6_K_PASCAL 64
+#define NWARPS_Q6_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q6_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q6_K_AMPERE;
+    const int mmq_y  = MMQ_Y_Q6_K_AMPERE;
+    const int nwarps = NWARPS_Q6_K_AMPERE;
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q6_K_PASCAL;
+    const int mmq_y  = MMQ_Y_Q6_K_PASCAL;
+    const int nwarps = NWARPS_Q6_K_PASCAL;
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q6_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -3942,48 +4343,32 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  = MMQ_X_Q4_0_AMPERE;
+        mmq_y  = MMQ_Y_Q4_0_AMPERE;
+        nwarps = NWARPS_Q4_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  = MMQ_X_Q4_0_PASCAL;
+        mmq_y  = MMQ_Y_Q4_0_PASCAL;
+        nwarps = NWARPS_Q4_0_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }
 }

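A note on the launcher above: the grid is sized by rounding the problem up to whole tiles, and the bounds-checked kernel variant is only selected when the row count does not divide evenly into tiles. A hedged, stand-alone sketch of that arithmetic with made-up sizes:

#include <cstdio>

int main() {
    const int nrows_x = 1000, ncols_y = 8;                    // hypothetical problem size
    const int mmq_x = 64, mmq_y = 64, nwarps = 8;             // Pascal-style tile sizes from the defines above
    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;    // ceil(1000/64) = 16 blocks along the rows
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;    // ceil(8/64) = 1 block along the columns
    const bool need_check = (nrows_x % mmq_y != 0);           // true: the last row tile is partial
    printf("grid %dx%d, %d warps/block, need_check=%d\n", block_num_x, block_num_y, nwarps, need_check);
    return 0;
}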
@@ -3995,49 +4380,32 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  = MMQ_X_Q4_1_AMPERE;
+        mmq_y  = MMQ_Y_Q4_1_AMPERE;
+        nwarps = NWARPS_Q4_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  = MMQ_X_Q4_1_PASCAL;
+        mmq_y  = MMQ_Y_Q4_1_PASCAL;
+        nwarps = NWARPS_Q4_1_PASCAL;
     } else {
-
-
-        const int nwarps = 8;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }

+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
 }

@@ -4049,48 +4417,32 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  = MMQ_X_Q5_0_AMPERE;
+        mmq_y  = MMQ_Y_Q5_0_AMPERE;
+        nwarps = NWARPS_Q5_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  = MMQ_X_Q5_0_PASCAL;
+        mmq_y  = MMQ_Y_Q5_0_PASCAL;
+        nwarps = NWARPS_Q5_0_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }
 }

@@ -4102,48 +4454,32 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  = MMQ_X_Q5_1_AMPERE;
+        mmq_y  = MMQ_Y_Q5_1_AMPERE;
+        nwarps = NWARPS_Q5_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  = MMQ_X_Q5_1_PASCAL;
+        mmq_y  = MMQ_Y_Q5_1_PASCAL;
+        nwarps = NWARPS_Q5_1_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }
 }

@@ -4155,48 +4491,32 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  = MMQ_X_Q8_0_AMPERE;
+        mmq_y  = MMQ_Y_Q8_0_AMPERE;
+        nwarps = NWARPS_Q8_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  = MMQ_X_Q8_0_PASCAL;
+        mmq_y  = MMQ_Y_Q8_0_PASCAL;
+        nwarps = NWARPS_Q8_0_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }
 }

@@ -4208,48 +4528,32 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  = MMQ_X_Q2_K_AMPERE;
+        mmq_y  = MMQ_Y_Q2_K_AMPERE;
+        nwarps = NWARPS_Q2_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  = MMQ_X_Q2_K_PASCAL;
+        mmq_y  = MMQ_Y_Q2_K_PASCAL;
+        nwarps = NWARPS_Q2_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }
 }

@@ -4261,48 +4565,32 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  = MMQ_X_Q3_K_AMPERE;
+        mmq_y  = MMQ_Y_Q3_K_AMPERE;
+        nwarps = NWARPS_Q3_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  = MMQ_X_Q3_K_PASCAL;
+        mmq_y  = MMQ_Y_Q3_K_PASCAL;
+        nwarps = NWARPS_Q3_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }
 }

@@ -4314,48 +4602,32 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  = MMQ_X_Q4_K_AMPERE;
+        mmq_y  = MMQ_Y_Q4_K_AMPERE;
+        nwarps = NWARPS_Q4_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  = MMQ_X_Q4_K_PASCAL;
+        mmq_y  = MMQ_Y_Q4_K_PASCAL;
+        nwarps = NWARPS_Q4_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }
 }

@@ -4367,48 +4639,32 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  = MMQ_X_Q5_K_AMPERE;
+        mmq_y  = MMQ_Y_Q5_K_AMPERE;
+        nwarps = NWARPS_Q5_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  = MMQ_X_Q5_K_PASCAL;
+        mmq_y  = MMQ_Y_Q5_K_PASCAL;
+        nwarps = NWARPS_Q5_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }
 }

@@ -4420,48 +4676,32 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  = MMQ_X_Q6_K_AMPERE;
+        mmq_y  = MMQ_Y_Q6_K_AMPERE;
+        nwarps = NWARPS_Q6_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  = MMQ_X_Q6_K_PASCAL;
+        mmq_y  = MMQ_Y_Q6_K_PASCAL;
+        nwarps = NWARPS_Q6_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    }
 }
 }

@@ -6229,3 +6469,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     func(tensor->src[0], tensor->src[1], tensor);
     return true;
 }
+
+int ggml_cuda_get_device_count() {
+    int device_count;
+    CUDA_CHECK(cudaGetDeviceCount(&device_count));
+    return device_count;
+}
+
+void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+    snprintf(description, description_size, "%s", prop.name);
+}
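The two helpers added above give callers a way to enumerate CUDA devices. A hedged usage sketch (assuming the declarations are exposed through ggml-cuda.h, as the header changes in this release suggest; error handling omitted):

#include <cstddef>
#include <cstdio>

extern "C" {
int  ggml_cuda_get_device_count(void);
void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
}

int main() {
    const int n = ggml_cuda_get_device_count();
    for (int i = 0; i < n; ++i) {
        char name[128];
        ggml_cuda_get_device_description(i, name, sizeof(name));
        printf("CUDA device %d: %s\n", i, name);
    }
    return 0;
}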