llama_cpp 0.3.7 → 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +8 -0
- data/ext/llama_cpp/src/ggml-alloc.c +36 -6
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +680 -428
- data/ext/llama_cpp/src/ggml-cuda.h +19 -23
- data/ext/llama_cpp/src/ggml-metal.h +6 -3
- data/ext/llama_cpp/src/ggml-metal.m +73 -128
- data/ext/llama_cpp/src/ggml-metal.metal +471 -498
- data/ext/llama_cpp/src/llama-util.h +10 -8
- data/ext/llama_cpp/src/llama.cpp +176 -64
- data/ext/llama_cpp/src/llama.h +3 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +1 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -1399,6 +1399,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1436,6 +1437,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1471,6 +1473,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1516,6 +1519,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1537,6 +1541,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp

     return d8_0*d8_1 * sumi;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1567,6 +1572,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1602,6 +1608,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(

     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1639,6 +1646,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(

     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1679,6 +1687,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(

     return d3 * sumf;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1704,6 +1713,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(

     return d3*d8 * sumi;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1737,12 +1747,12 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

 // contiguous u/y values
-// also used for q5_K
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
     const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
@@ -1752,19 +1762,18 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     float sumf_m = 0.0f;

 #pragma unroll
-    for (int
+    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
         int sumi_d = 0;

 #pragma unroll
-        for (int
-            sumi_d = __dp4a(v[
-            sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
         }

-        const float2 ds8f = __half22float2(ds8[
+        const float2 ds8f = __half22float2(ds8[i]);

-        sumf_d += ds8f.x * (sc[
-        sumf_m += ds8f.y * m[
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
     }

     const float2 dm4f = __half22float2(dm4);
@@ -1772,6 +1781,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1780,7 +1790,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
 #define VDR_Q5_K_Q8_1_MMQ 8

 // contiguous v/x values
-static __device__ __forceinline__ float
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
     const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {

@@ -1812,6 +1822,41 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
     return dm5f.x*sumf_d - dm5f.y*sumf_m;

 #else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i]);
+
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1842,6 +1887,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(

     return d*sumf;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1873,6 +1919,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -2722,6 +2769,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

@@ -2808,18 +2856,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
-        v[l + 0] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
-        v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
-    }
-
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

     const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
-    return vec_dot_q4_K_q8_1_impl_mmq(
+    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
+        x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
@@ -2866,7 +2907,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
         u[2*i+1] = q8[4];
     }

-    return
+    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);

 #else

@@ -2905,6 +2946,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

@@ -3008,7 +3050,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(

     const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
     const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
-    return
+    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
+        x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -3135,7 +3178,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(

 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
               allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
-static
+static __device__ __forceinline__ void mul_mat_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

@@ -3150,7 +3193,6 @@ static __global__ void mul_mat_q(

     const int row_dst_0 = blockIdx.x*mmq_y;
     const int & row_x_0 = row_dst_0;
-    const int row_dst = row_dst_0 + threadIdx.x;

     const int col_dst_0 = blockIdx.y*mmq_x;
     const int & col_y_0 = col_dst_0;
@@ -3223,11 +3265,7 @@ static __global__ void mul_mat_q(
         }
     }

-
-    if (row_dst >= nrows_dst) {
-        return;
-    }
-
+#pragma unroll
     for (int j = 0; j < mmq_x; j += nwarps) {
         const int col_dst = col_dst_0 + j + threadIdx.y;

@@ -3235,12 +3273,375 @@ static __global__ void mul_mat_q(
             return;
         }

+#pragma unroll
         for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-
+            const int row_dst = row_dst_0 + threadIdx.x + i;
+
+            if (row_dst >= nrows_dst) {
+                continue;
+            }
+
+            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
         }
     }
 }

+#define MMQ_X_Q4_0_AMPERE 64
+#define MMQ_Y_Q4_0_AMPERE 128
+#define NWARPS_Q4_0_AMPERE 4
+#define MMQ_X_Q4_0_PASCAL 64
+#define MMQ_Y_Q4_0_PASCAL 64
+#define NWARPS_Q4_0_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q4_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x = MMQ_X_Q4_0_AMPERE;
+    const int mmq_y = MMQ_Y_Q4_0_AMPERE;
+    const int nwarps = NWARPS_Q4_0_AMPERE;
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x = MMQ_X_Q4_0_PASCAL;
+    const int mmq_y = MMQ_Y_Q4_0_PASCAL;
+    const int nwarps = NWARPS_Q4_0_PASCAL;
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q4_1_AMPERE 64
+#define MMQ_Y_Q4_1_AMPERE 128
+#define NWARPS_Q4_1_AMPERE 4
+#define MMQ_X_Q4_1_PASCAL 64
+#define MMQ_Y_Q4_1_PASCAL 64
+#define NWARPS_Q4_1_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q4_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x = MMQ_X_Q4_1_AMPERE;
+    const int mmq_y = MMQ_Y_Q4_1_AMPERE;
+    const int nwarps = NWARPS_Q4_1_AMPERE;
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x = MMQ_X_Q4_1_PASCAL;
+    const int mmq_y = MMQ_Y_Q4_1_PASCAL;
+    const int nwarps = NWARPS_Q4_1_PASCAL;
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_1_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q5_0_AMPERE 128
+#define MMQ_Y_Q5_0_AMPERE 64
+#define NWARPS_Q5_0_AMPERE 4
+#define MMQ_X_Q5_0_PASCAL 64
+#define MMQ_Y_Q5_0_PASCAL 64
+#define NWARPS_Q5_0_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q5_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x = MMQ_X_Q5_0_AMPERE;
+    const int mmq_y = MMQ_Y_Q5_0_AMPERE;
+    const int nwarps = NWARPS_Q5_0_AMPERE;
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x = MMQ_X_Q5_0_PASCAL;
+    const int mmq_y = MMQ_Y_Q5_0_PASCAL;
+    const int nwarps = NWARPS_Q5_0_PASCAL;
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q5_1_AMPERE 128
+#define MMQ_Y_Q5_1_AMPERE 64
+#define NWARPS_Q5_1_AMPERE 4
+#define MMQ_X_Q5_1_PASCAL 64
+#define MMQ_Y_Q5_1_PASCAL 64
+#define NWARPS_Q5_1_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q5_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x = MMQ_X_Q5_1_AMPERE;
+    const int mmq_y = MMQ_Y_Q5_1_AMPERE;
+    const int nwarps = NWARPS_Q5_1_AMPERE;
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x = MMQ_X_Q5_1_PASCAL;
+    const int mmq_y = MMQ_Y_Q5_1_PASCAL;
+    const int nwarps = NWARPS_Q5_1_PASCAL;
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_1_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q8_0_AMPERE 128
+#define MMQ_Y_Q8_0_AMPERE 64
+#define NWARPS_Q8_0_AMPERE 4
+#define MMQ_X_Q8_0_PASCAL 64
+#define MMQ_Y_Q8_0_PASCAL 64
+#define NWARPS_Q8_0_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q8_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x = MMQ_X_Q8_0_AMPERE;
+    const int mmq_y = MMQ_Y_Q8_0_AMPERE;
+    const int nwarps = NWARPS_Q8_0_AMPERE;
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x = MMQ_X_Q8_0_PASCAL;
+    const int mmq_y = MMQ_Y_Q8_0_PASCAL;
+    const int nwarps = NWARPS_Q8_0_PASCAL;
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q8_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q2_K_AMPERE 64
+#define MMQ_Y_Q2_K_AMPERE 128
+#define NWARPS_Q2_K_AMPERE 4
+#define MMQ_X_Q2_K_PASCAL 64
+#define MMQ_Y_Q2_K_PASCAL 64
+#define NWARPS_Q2_K_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q2_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x = MMQ_X_Q2_K_AMPERE;
+    const int mmq_y = MMQ_Y_Q2_K_AMPERE;
+    const int nwarps = NWARPS_Q2_K_AMPERE;
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x = MMQ_X_Q2_K_PASCAL;
+    const int mmq_y = MMQ_Y_Q2_K_PASCAL;
+    const int nwarps = NWARPS_Q2_K_PASCAL;
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q2_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q3_K_AMPERE 128
+#define MMQ_Y_Q3_K_AMPERE 128
+#define NWARPS_Q3_K_AMPERE 4
+#define MMQ_X_Q3_K_PASCAL 64
+#define MMQ_Y_Q3_K_PASCAL 64
+#define NWARPS_Q3_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q3_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x = MMQ_X_Q3_K_AMPERE;
+    const int mmq_y = MMQ_Y_Q3_K_AMPERE;
+    const int nwarps = NWARPS_Q3_K_AMPERE;
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x = MMQ_X_Q3_K_PASCAL;
+    const int mmq_y = MMQ_Y_Q3_K_PASCAL;
+    const int nwarps = NWARPS_Q3_K_PASCAL;
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q3_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q4_K_AMPERE 64
+#define MMQ_Y_Q4_K_AMPERE 128
+#define NWARPS_Q4_K_AMPERE 4
+#define MMQ_X_Q4_K_PASCAL 64
+#define MMQ_Y_Q4_K_PASCAL 64
+#define NWARPS_Q4_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q4_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x = MMQ_X_Q4_K_AMPERE;
+    const int mmq_y = MMQ_Y_Q4_K_AMPERE;
+    const int nwarps = NWARPS_Q4_K_AMPERE;
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x = MMQ_X_Q4_K_PASCAL;
+    const int mmq_y = MMQ_Y_Q4_K_PASCAL;
+    const int nwarps = NWARPS_Q4_K_PASCAL;
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q5_K_AMPERE 64
+#define MMQ_Y_Q5_K_AMPERE 128
+#define NWARPS_Q5_K_AMPERE 4
+#define MMQ_X_Q5_K_PASCAL 64
+#define MMQ_Y_Q5_K_PASCAL 64
+#define NWARPS_Q5_K_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q5_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x = MMQ_X_Q5_K_AMPERE;
+    const int mmq_y = MMQ_Y_Q5_K_AMPERE;
+    const int nwarps = NWARPS_Q5_K_AMPERE;
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x = MMQ_X_Q5_K_PASCAL;
+    const int mmq_y = MMQ_Y_Q5_K_PASCAL;
+    const int nwarps = NWARPS_Q5_K_PASCAL;
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q6_K_AMPERE 64
+#define MMQ_Y_Q6_K_AMPERE 64
+#define NWARPS_Q6_K_AMPERE 4
+#define MMQ_X_Q6_K_PASCAL 64
+#define MMQ_Y_Q6_K_PASCAL 64
+#define NWARPS_Q6_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q6_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x = MMQ_X_Q6_K_AMPERE;
+    const int mmq_y = MMQ_Y_Q6_K_AMPERE;
+    const int nwarps = NWARPS_Q6_K_AMPERE;
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x = MMQ_X_Q6_K_PASCAL;
+    const int mmq_y = MMQ_Y_Q6_K_PASCAL;
+    const int nwarps = NWARPS_Q6_K_PASCAL;
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q6_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
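The hunks above replace the single runtime-configured mul_mat_q kernel with one thin __global__ wrapper per quantization type, whose tile shape (MMQ_X_*/MMQ_Y_*/NWARPS_*) is baked in at compile time via __CUDA_ARCH__. Below is a minimal sketch of that dispatch pattern, not taken from llama.cpp: every name here (tile_work, tile_kernel, launch_tile_kernel, TILE_X_BIG/TILE_X_SMALL, the arch cutoff 700) is hypothetical, and the real code instead uses CC_TURING/MIN_CC_DP4A and the per-type macros shown in the diff. The point it illustrates is that the device side selects the constants under __CUDA_ARCH__ while the host side repeats the same selection at runtime so the launch grid matches.

// Hypothetical illustration of compile-time tile selection (not llama.cpp code).
#define TILE_X_BIG   64
#define TILE_X_SMALL 32

template <int tile_x>
static __device__ __forceinline__ void tile_work(float * dst, int n) {
    // One thread per element; tile_x is a compile-time constant.
    const int i = blockIdx.x*tile_x + threadIdx.x;
    if (i < n) {
        dst[i] += 1.0f;
    }
}

static __global__ void tile_kernel(float * dst, int n) {
#if __CUDA_ARCH__ >= 700
    tile_work<TILE_X_BIG>(dst, n);    // newer architectures get the larger tile
#else
    tile_work<TILE_X_SMALL>(dst, n);  // older architectures get the smaller tile
#endif
}

static void launch_tile_kernel(float * dst, int n, int compute_capability, cudaStream_t stream) {
    // The host must pick the same tile size the device code was compiled with,
    // mirroring how ggml_mul_mat_*_q8_1_cuda picks MMQ_X/MMQ_Y/NWARPS below.
    const int tile_x = compute_capability >= 700 ? TILE_X_BIG : TILE_X_SMALL;
    const dim3 block_nums((n + tile_x - 1) / tile_x, 1, 1);
    const dim3 block_dims(tile_x, 1, 1);
    tile_kernel<<<block_nums, block_dims, 0, stream>>>(dst, n);
}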
@@ -3942,48 +4343,32 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q4_0_AMPERE;
+        mmq_y = MMQ_Y_Q4_0_AMPERE;
+        nwarps = NWARPS_Q4_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q4_0_PASCAL;
+        mmq_y = MMQ_Y_Q4_0_PASCAL;
+        nwarps = NWARPS_Q4_0_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-            load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -3995,49 +4380,32 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q4_1_AMPERE;
+        mmq_y = MMQ_Y_Q4_1_AMPERE;
+        nwarps = NWARPS_Q4_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q4_1_PASCAL;
+        mmq_y = MMQ_Y_Q4_1_PASCAL;
+        nwarps = NWARPS_Q4_1_PASCAL;
     } else {
-
-
-        const int nwarps = 8;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }

+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4049,48 +4417,32 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q5_0_AMPERE;
+        mmq_y = MMQ_Y_Q5_0_AMPERE;
+        nwarps = NWARPS_Q5_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q5_0_PASCAL;
+        mmq_y = MMQ_Y_Q5_0_PASCAL;
+        nwarps = NWARPS_Q5_0_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-            load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4102,48 +4454,32 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q5_1_AMPERE;
+        mmq_y = MMQ_Y_Q5_1_AMPERE;
+        nwarps = NWARPS_Q5_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q5_1_PASCAL;
+        mmq_y = MMQ_Y_Q5_1_PASCAL;
+        nwarps = NWARPS_Q5_1_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-            load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4155,48 +4491,32 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q8_0_AMPERE;
+        mmq_y = MMQ_Y_Q8_0_AMPERE;
+        nwarps = NWARPS_Q8_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q8_0_PASCAL;
+        mmq_y = MMQ_Y_Q8_0_PASCAL;
+        nwarps = NWARPS_Q8_0_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-            load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4208,48 +4528,32 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q2_K_AMPERE;
+        mmq_y = MMQ_Y_Q2_K_AMPERE;
+        nwarps = NWARPS_Q2_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q2_K_PASCAL;
+        mmq_y = MMQ_Y_Q2_K_PASCAL;
+        nwarps = NWARPS_Q2_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-            load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4261,48 +4565,32 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q3_K_AMPERE;
+        mmq_y = MMQ_Y_Q3_K_AMPERE;
+        nwarps = NWARPS_Q3_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q3_K_PASCAL;
+        mmq_y = MMQ_Y_Q3_K_PASCAL;
+        nwarps = NWARPS_Q3_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-            load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4314,48 +4602,32 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q4_K_AMPERE;
+        mmq_y = MMQ_Y_Q4_K_AMPERE;
+        nwarps = NWARPS_Q4_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q4_K_PASCAL;
+        mmq_y = MMQ_Y_Q4_K_PASCAL;
+        nwarps = NWARPS_Q4_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-            load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4367,48 +4639,32 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q5_K_AMPERE;
+        mmq_y = MMQ_Y_Q5_K_AMPERE;
+        nwarps = NWARPS_Q5_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q5_K_PASCAL;
+        mmq_y = MMQ_Y_Q5_K_PASCAL;
+        nwarps = NWARPS_Q5_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-            load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4420,48 +4676,32 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q6_K_AMPERE;
+        mmq_y = MMQ_Y_Q6_K_AMPERE;
+        nwarps = NWARPS_Q6_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q6_K_PASCAL;
+        mmq_y = MMQ_Y_Q6_K_PASCAL;
+        nwarps = NWARPS_Q6_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-            load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-            <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -6229,3 +6469,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     func(tensor->src[0], tensor->src[1], tensor);
     return true;
 }
+
+int ggml_cuda_get_device_count() {
+    int device_count;
+    CUDA_CHECK(cudaGetDeviceCount(&device_count));
+    return device_count;
+}
+
+void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+    snprintf(description, description_size, "%s", prop.name);
+}
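The last hunk adds two small device-enumeration helpers. A hedged usage sketch follows; it assumes the functions are declared in ggml-cuda.h (which this release also touches) and that the program is built with CUDA support. The 256-byte buffer size is an arbitrary choice for illustration.

// Hypothetical caller of the two helpers added above (not part of the gem).
#include <stdio.h>
#include "ggml-cuda.h"

int main(void) {
    const int n_devices = ggml_cuda_get_device_count();
    for (int i = 0; i < n_devices; ++i) {
        char description[256];
        ggml_cuda_get_device_description(i, description, sizeof(description));
        printf("device %d: %s\n", i, description);
    }
    return 0;
}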