llama_cpp 0.3.6 → 0.3.7
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +8 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1165 -721
- data/ext/llama_cpp/src/ggml-metal.m +39 -18
- data/ext/llama_cpp/src/ggml.c +396 -150
- data/ext/llama_cpp/src/ggml.h +113 -32
- data/ext/llama_cpp/src/llama-util.h +41 -1
- data/ext/llama_cpp/src/llama.cpp +214 -146
- data/ext/llama_cpp/src/llama.h +18 -1
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
@@ -14,6 +14,7 @@
 #include "ggml.h"
 
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define CC_TURING 700
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
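Annotation (not part of the gem's diff): the guard above exists because __dp4a, the byte-wise dot-product intrinsic used throughout this file, is only available from compute capability 6.1. A minimal sketch of what it computes, assuming a CUDA toolchain targeting sm_61 or newer; the kernel and variable names here are illustrative only.

// Hedged sketch: byte-wise dot product with __dp4a, plus a manual fallback.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void dp4a_demo(const int * v, const int * u, const int n, int * out) {
    int sumi = 0;
#if __CUDA_ARCH__ >= 610 // the diff spells this threshold MIN_CC_DP4A
    for (int i = 0; i < n; ++i) {
        sumi = __dp4a(v[i], u[i], sumi); // 4 packed int8 products accumulated per call
    }
#else
    for (int i = 0; i < n; ++i) { // fallback: unpack the four signed bytes by hand
        const char4 a = *(const char4 *) &v[i];
        const char4 b = *(const char4 *) &u[i];
        sumi += a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
    }
#endif
    *out = sumi;
}

int main() {
    int *v, *u, *out;
    cudaMallocManaged(&v, sizeof(int));
    cudaMallocManaged(&u, sizeof(int));
    cudaMallocManaged(&out, sizeof(int));
    *v = 0x01020304; // bytes 1..4
    *u = 0x01010101; // dotted with all-ones -> 1+2+3+4 = 10
    dp4a_demo<<<1, 1>>>(v, u, 1, out);
    cudaDeviceSynchronize();
    printf("dot = %d\n", *out); // expected: 10
    return 0;
}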
@@ -262,10 +263,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
-#ifndef GGML_CUDA_MMQ_Y
-#define GGML_CUDA_MMQ_Y 64
-#endif // GGML_CUDA_MMQ_Y
-
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
@@ -285,6 +282,20 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
 };
 
+static int g_device_count = -1;
+static int g_main_device = 0;
+static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+static bool g_mul_mat_q = false;
+
+static void * g_scratch_buffer = nullptr;
+static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
+static size_t g_scratch_offset = 0;
+
+static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -1383,8 +1394,10 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
         sumi = __dp4a(vi1, u[2*i+1], sumi);
     }
 
+    const float2 ds8f = __half22float2(ds8);
+
     // second part effectively subtracts 8 from each quant value
-    return d4 * (sumi *
+    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -1410,12 +1423,14 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     }
 
 #ifdef GGML_CUDA_F16
-    const
-    const float d4d8 =
-    const float m4s8 =
+    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
+    const float d4d8 = tmp.x;
+    const float m4s8 = tmp.y;
 #else
-    const
-    const
+    const float2 dm4f = __half22float2(dm4);
+    const float2 ds8f = __half22float2(ds8);
+    const float d4d8 = dm4f.x * ds8f.x;
+    const float m4s8 = dm4f.y * ds8f.y;
 #endif // GGML_CUDA_F16
 
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
@@ -1434,6 +1449,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;
 
+#pragma unroll
     for (int i = 0; i < vdr; ++i) {
         int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
         vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
@@ -1450,8 +1466,10 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
         sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
     }
 
+    const float2 ds8f = __half22float2(ds8);
+
     // second part effectively subtracts 16 from each quant value
-    return d5 * (sumi*
+    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -1466,6 +1484,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;
 
+#pragma unroll
     for (int i = 0; i < vdr; ++i) {
         int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
         vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
@@ -1483,12 +1502,14 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     }
 
 #ifdef GGML_CUDA_F16
-    const
-    const float d5d8 =
-    const float m5s8 =
+    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
+    const float d5d8 = tmp.x;
+    const float m5s8 = tmp.y;
 #else
-    const
-    const
+    const float2 dm5f = __half22float2(dm5);
+    const float2 ds8f = __half22float2(ds8);
+    const float d5d8 = dm5f.x * ds8f.x;
+    const float m5s8 = dm5f.y * ds8f.y;
 #endif // GGML_CUDA_F16
 
     // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
@@ -1503,17 +1524,18 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
 #define VDR_Q8_0_Q8_1_MMQ 8
 
 template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
-    const int * v, const int * u, const float & d8_0, const
+    const int * v, const int * u, const float & d8_0, const float & d8_1) {
 
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;
 
+#pragma unroll
     for (int i = 0; i < vdr; ++i) {
         // SIMD dot product of quantized values
         sumi = __dp4a(v[i], u[i], sumi);
     }
 
-    return
+    return d8_0*d8_1 * sumi;
 #else
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -1525,18 +1547,21 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;
 
+#pragma unroll
     for (int i = 0; i < vdr; ++i) {
         // SIMD dot product of quantized values
         sumi = __dp4a(v[i], u[i], sumi);
     }
 
 #ifdef GGML_CUDA_F16
-    const
-    const float d8d8 =
-    const float m8s8 =
+    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
+    const float d8d8 = tmp.x;
+    const float m8s8 = tmp.y;
 #else
-    const
-    const
+    const float2 dm8f = __half22float2(dm8);
+    const float2 ds8f = __half22float2(ds8);
+    const float d8d8 = dm8f.x * ds8f.x;
+    const float m8s8 = dm8f.y * ds8f.y;
 #endif // GGML_CUDA_F16
 
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
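Annotation (not part of the gem's diff): the GGML_CUDA_F16 branches in the hunks above compute the pair of scale products either in half precision (one __hmul2, then a single conversion) or in single precision (convert each operand, then multiply componentwise). A minimal sketch of the two equivalent forms, assuming cuda_fp16.h and a device of compute capability 5.3 or newer for the half-arithmetic path; the function names here are invented for illustration.

// Hedged sketch: forming {d*d', m*s'} from two half2 scale pairs.
#include <cuda_fp16.h>

__device__ float2 scale_pair_f16(const half2 dm, const half2 ds) {
    return __half22float2(__hmul2(dm, ds));           // multiply in half2, convert once
}

__device__ float2 scale_pair_f32(const half2 dm, const half2 ds) {
    const float2 dmf = __half22float2(dm);            // convert each operand first
    const float2 dsf = __half22float2(ds);
    return make_float2(dmf.x * dsf.x, dmf.y * dsf.y); // then multiply componentwise
}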
@@ -1546,6 +1571,312 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
|
|
1546
1571
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1547
1572
|
}
|
1548
1573
|
|
1574
|
+
#define VDR_Q2_K_Q8_1_MMVQ 1
|
1575
|
+
#define VDR_Q2_K_Q8_1_MMQ 2
|
1576
|
+
|
1577
|
+
// contiguous v/x values
|
1578
|
+
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
|
1579
|
+
const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
1580
|
+
const half2 & dm2, const float * __restrict__ d8) {
|
1581
|
+
|
1582
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1583
|
+
float sumf_d = 0.0f;
|
1584
|
+
float sumf_m = 0.0f;
|
1585
|
+
|
1586
|
+
#pragma unroll
|
1587
|
+
for (int i = 0; i < QR2_K; ++i) {
|
1588
|
+
const int sc = scales[2*i];
|
1589
|
+
|
1590
|
+
const int vi = (v >> (2*i)) & 0x03030303;
|
1591
|
+
|
1592
|
+
sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
|
1593
|
+
|
1594
|
+
// fill int with 4x m
|
1595
|
+
int m = sc >> 4;
|
1596
|
+
m |= m << 8;
|
1597
|
+
m |= m << 16;
|
1598
|
+
sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
|
1599
|
+
}
|
1600
|
+
|
1601
|
+
const float2 dm2f = __half22float2(dm2);
|
1602
|
+
|
1603
|
+
return dm2f.x*sumf_d - dm2f.y*sumf_m;
|
1604
|
+
#else
|
1605
|
+
return 0.0f; // only to satisfy the compiler
|
1606
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1607
|
+
}
|
1608
|
+
|
1609
|
+
// contiguous u/y values
|
1610
|
+
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
|
1611
|
+
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
1612
|
+
const half2 & dm2, const float & d8) {
|
1613
|
+
|
1614
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1615
|
+
int sumi_d = 0;
|
1616
|
+
int sumi_m = 0;
|
1617
|
+
|
1618
|
+
#pragma unroll
|
1619
|
+
for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
|
1620
|
+
int sumi_d_sc = 0;
|
1621
|
+
|
1622
|
+
const int sc = scales[i0 / (QI8_1/2)];
|
1623
|
+
|
1624
|
+
// fill int with 4x m
|
1625
|
+
int m = sc >> 4;
|
1626
|
+
m |= m << 8;
|
1627
|
+
m |= m << 16;
|
1628
|
+
|
1629
|
+
#pragma unroll
|
1630
|
+
for (int i = i0; i < i0 + QI8_1/2; ++i) {
|
1631
|
+
sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
|
1632
|
+
sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m
|
1633
|
+
}
|
1634
|
+
|
1635
|
+
sumi_d += sumi_d_sc * (sc & 0xF);
|
1636
|
+
}
|
1637
|
+
|
1638
|
+
const float2 dm2f = __half22float2(dm2);
|
1639
|
+
|
1640
|
+
return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
|
1641
|
+
#else
|
1642
|
+
return 0.0f; // only to satisfy the compiler
|
1643
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1644
|
+
}
|
1645
|
+
|
1646
|
+
#define VDR_Q3_K_Q8_1_MMVQ 1
|
1647
|
+
#define VDR_Q3_K_Q8_1_MMQ 2
|
1648
|
+
|
1649
|
+
// contiguous v/x values
|
1650
|
+
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
|
1651
|
+
const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
1652
|
+
const int & scale_offset, const float & d3, const float * __restrict__ d8) {
|
1653
|
+
|
1654
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1655
|
+
float sumf = 0.0f;
|
1656
|
+
|
1657
|
+
#pragma unroll
|
1658
|
+
for (int i = 0; i < QR3_K; ++i) {
|
1659
|
+
const int isc = scale_offset + 2*i;
|
1660
|
+
|
1661
|
+
const int isc_low = isc % (QK_K/32);
|
1662
|
+
const int sc_shift_low = 4 * (isc / (QK_K/32));
|
1663
|
+
const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
|
1664
|
+
|
1665
|
+
const int isc_high = isc % (QK_K/64);
|
1666
|
+
const int sc_shift_high = 2 * (isc / (QK_K/64));
|
1667
|
+
const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
|
1668
|
+
|
1669
|
+
const int sc = (sc_low | sc_high) - 32;
|
1670
|
+
|
1671
|
+
const int vil = (vl >> (2*i)) & 0x03030303;
|
1672
|
+
|
1673
|
+
const int vih = ((vh >> i) << 2) & 0x04040404;
|
1674
|
+
|
1675
|
+
const int vi = __vsubss4(vil, vih);
|
1676
|
+
|
1677
|
+
sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
|
1678
|
+
}
|
1679
|
+
|
1680
|
+
return d3 * sumf;
|
1681
|
+
#else
|
1682
|
+
return 0.0f; // only to satisfy the compiler
|
1683
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1684
|
+
}
|
1685
|
+
|
1686
|
+
// contiguous u/y values
|
1687
|
+
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
|
1688
|
+
const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
|
1689
|
+
const float & d3, const float & d8) {
|
1690
|
+
|
1691
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1692
|
+
int sumi = 0;
|
1693
|
+
|
1694
|
+
#pragma unroll
|
1695
|
+
for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
|
1696
|
+
int sumi_sc = 0;
|
1697
|
+
|
1698
|
+
for (int i = i0; i < i0 + QI8_1/2; ++i) {
|
1699
|
+
sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
|
1700
|
+
}
|
1701
|
+
|
1702
|
+
sumi += sumi_sc * scales[i0 / (QI8_1/2)];
|
1703
|
+
}
|
1704
|
+
|
1705
|
+
return d3*d8 * sumi;
|
1706
|
+
#else
|
1707
|
+
return 0.0f; // only to satisfy the compiler
|
1708
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1709
|
+
}
|
1710
|
+
|
1711
|
+
#define VDR_Q4_K_Q8_1_MMVQ 2
|
1712
|
+
#define VDR_Q4_K_Q8_1_MMQ 8
|
1713
|
+
|
1714
|
+
// contiguous v/x values
|
1715
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
|
1716
|
+
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
1717
|
+
const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
|
1718
|
+
|
1719
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1720
|
+
float sumf_d = 0.0f;
|
1721
|
+
float sumf_m = 0.0f;
|
1722
|
+
|
1723
|
+
#pragma unroll
|
1724
|
+
for (int i = 0; i < QR4_K; ++i) {
|
1725
|
+
const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
|
1726
|
+
const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
|
1727
|
+
|
1728
|
+
const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
|
1729
|
+
const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
|
1730
|
+
|
1731
|
+
sumf_d += d8[i] * (dot1 * sc[i]);
|
1732
|
+
sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
|
1733
|
+
}
|
1734
|
+
|
1735
|
+
const float2 dm4f = __half22float2(dm4);
|
1736
|
+
|
1737
|
+
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
1738
|
+
|
1739
|
+
#else
|
1740
|
+
return 0.0f; // only to satisfy the compiler
|
1741
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1742
|
+
}
|
1743
|
+
|
1744
|
+
// contiguous u/y values
|
1745
|
+
// also used for q5_K
|
1746
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
|
1747
|
+
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
1748
|
+
const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
|
1749
|
+
|
1750
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1751
|
+
float sumf_d = 0.0f;
|
1752
|
+
float sumf_m = 0.0f;
|
1753
|
+
|
1754
|
+
#pragma unroll
|
1755
|
+
for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) {
|
1756
|
+
int sumi_d = 0;
|
1757
|
+
|
1758
|
+
#pragma unroll
|
1759
|
+
for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) {
|
1760
|
+
sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product
|
1761
|
+
sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
|
1762
|
+
}
|
1763
|
+
|
1764
|
+
const float2 ds8f = __half22float2(ds8[i0 / 4]);
|
1765
|
+
|
1766
|
+
sumf_d += ds8f.x * (sc[i0/4] * sumi_d);
|
1767
|
+
sumf_m += ds8f.y * m[i0/4]; // sum of q8_1 block * q4_K min val
|
1768
|
+
}
|
1769
|
+
|
1770
|
+
const float2 dm4f = __half22float2(dm4);
|
1771
|
+
|
1772
|
+
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
1773
|
+
|
1774
|
+
#else
|
1775
|
+
return 0.0f; // only to satisfy the compiler
|
1776
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1777
|
+
}
|
1778
|
+
|
1779
|
+
#define VDR_Q5_K_Q8_1_MMVQ 2
|
1780
|
+
#define VDR_Q5_K_Q8_1_MMQ 8
|
1781
|
+
|
1782
|
+
// contiguous v/x values
|
1783
|
+
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
|
1784
|
+
const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
1785
|
+
const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
|
1786
|
+
|
1787
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1788
|
+
float sumf_d = 0.0f;
|
1789
|
+
float sumf_m = 0.0f;
|
1790
|
+
|
1791
|
+
#pragma unroll
|
1792
|
+
for (int i = 0; i < QR5_K; ++i) {
|
1793
|
+
const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
|
1794
|
+
const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
|
1795
|
+
|
1796
|
+
const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
|
1797
|
+
const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
|
1798
|
+
|
1799
|
+
const int v0i = vl0i | vh0i;
|
1800
|
+
const int v1i = vl1i | vh1i;
|
1801
|
+
|
1802
|
+
const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
|
1803
|
+
const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
|
1804
|
+
|
1805
|
+
sumf_d += d8[i] * (dot1 * sc[i]);
|
1806
|
+
sumf_m += d8[i] * (dot2 * m[i]);
|
1807
|
+
|
1808
|
+
}
|
1809
|
+
|
1810
|
+
const float2 dm5f = __half22float2(dm5);
|
1811
|
+
|
1812
|
+
return dm5f.x*sumf_d - dm5f.y*sumf_m;
|
1813
|
+
|
1814
|
+
#else
|
1815
|
+
return 0.0f; // only to satisfy the compiler
|
1816
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1817
|
+
}
|
1818
|
+
|
1819
|
+
#define VDR_Q6_K_Q8_1_MMVQ 1
|
1820
|
+
#define VDR_Q6_K_Q8_1_MMQ 8
|
1821
|
+
|
1822
|
+
// contiguous v/x values
|
1823
|
+
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
|
1824
|
+
const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
|
1825
|
+
const float & d, const float * __restrict__ d8) {
|
1826
|
+
|
1827
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1828
|
+
float sumf = 0.0f;
|
1829
|
+
|
1830
|
+
#pragma unroll
|
1831
|
+
for (int i = 0; i < QR6_K; ++i) {
|
1832
|
+
const int sc = scales[4*i];
|
1833
|
+
|
1834
|
+
const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
|
1835
|
+
|
1836
|
+
const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
|
1837
|
+
|
1838
|
+
const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
|
1839
|
+
|
1840
|
+
sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
|
1841
|
+
}
|
1842
|
+
|
1843
|
+
return d*sumf;
|
1844
|
+
#else
|
1845
|
+
return 0.0f; // only to satisfy the compiler
|
1846
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1847
|
+
}
|
1848
|
+
|
1849
|
+
// contiguous u/y values
|
1850
|
+
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
|
1851
|
+
const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
|
1852
|
+
const float & d6, const float * __restrict__ d8) {
|
1853
|
+
|
1854
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1855
|
+
float sumf_d = 0.0f;
|
1856
|
+
|
1857
|
+
#pragma unroll
|
1858
|
+
for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
|
1859
|
+
int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
|
1860
|
+
|
1861
|
+
#pragma unroll
|
1862
|
+
for (int i = i0; i < i0 + 2; ++i) {
|
1863
|
+
sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
|
1864
|
+
sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
|
1865
|
+
|
1866
|
+
sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
|
1867
|
+
sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
|
1868
|
+
}
|
1869
|
+
|
1870
|
+
sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
|
1871
|
+
}
|
1872
|
+
|
1873
|
+
return d6 * sumf_d;
|
1874
|
+
|
1875
|
+
#else
|
1876
|
+
return 0.0f; // only to satisfy the compiler
|
1877
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1878
|
+
}
|
1879
|
+
|
1549
1880
|
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
|
1550
1881
|
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
1551
1882
|
|
@@ -1564,21 +1895,21 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
|
|
1564
1895
|
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
|
1565
1896
|
}
|
1566
1897
|
|
1567
|
-
static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1898
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1568
1899
|
|
1569
|
-
__shared__ int tile_x_qs[
|
1570
|
-
__shared__ float tile_x_d[
|
1900
|
+
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
|
1901
|
+
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
|
1571
1902
|
|
1572
1903
|
*x_ql = tile_x_qs;
|
1573
1904
|
*x_dm = (half2 *) tile_x_d;
|
1574
1905
|
}
|
1575
1906
|
|
1576
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
|
1907
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
|
1577
1908
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1578
1909
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1579
1910
|
|
1580
1911
|
__builtin_assume(i_offset >= 0);
|
1581
|
-
__builtin_assume(i_offset <
|
1912
|
+
__builtin_assume(i_offset < nwarps);
|
1582
1913
|
__builtin_assume(k >= 0);
|
1583
1914
|
__builtin_assume(k < WARP_SIZE);
|
1584
1915
|
|
@@ -1590,7 +1921,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
1590
1921
|
float * x_dmf = (float *) x_dm;
|
1591
1922
|
|
1592
1923
|
#pragma unroll
|
1593
|
-
for (int i0 = 0; i0 <
|
1924
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
1594
1925
|
int i = i0 + i_offset;
|
1595
1926
|
|
1596
1927
|
if (need_check) {
|
@@ -1600,38 +1931,30 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
1600
1931
|
const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
1601
1932
|
|
1602
1933
|
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
|
1603
|
-
x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
|
1934
|
+
// x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
|
1604
1935
|
}
|
1605
1936
|
|
1606
|
-
|
1607
|
-
|
1937
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
|
1938
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
1608
1939
|
|
1609
|
-
|
1610
|
-
|
1611
|
-
|
1612
|
-
// const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
|
1940
|
+
#pragma unroll
|
1941
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
|
1942
|
+
int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
|
1613
1943
|
|
1614
|
-
|
1615
|
-
|
1616
|
-
|
1944
|
+
if (need_check) {
|
1945
|
+
i = min(i, i_max);
|
1946
|
+
}
|
1617
1947
|
|
1618
|
-
|
1948
|
+
const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
1619
1949
|
|
1620
|
-
|
1621
|
-
|
1950
|
+
x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
|
1951
|
+
}
|
1622
1952
|
}
|
1623
1953
|
|
1624
1954
|
static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
|
1625
1955
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1626
1956
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1627
1957
|
|
1628
|
-
__builtin_assume(i >= 0);
|
1629
|
-
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1630
|
-
__builtin_assume(j >= 0);
|
1631
|
-
__builtin_assume(j < WARP_SIZE);
|
1632
|
-
__builtin_assume(k >= 0);
|
1633
|
-
__builtin_assume(k < WARP_SIZE);
|
1634
|
-
|
1635
1958
|
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1636
1959
|
const float * x_dmf = (float *) x_dm;
|
1637
1960
|
|
@@ -1639,13 +1962,13 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
|
|
1639
1962
|
|
1640
1963
|
#pragma unroll
|
1641
1964
|
for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
|
1642
|
-
u[2*l+0] = y_qs[j *
|
1643
|
-
u[2*l+1] = y_qs[j *
|
1965
|
+
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
|
1966
|
+
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
|
1644
1967
|
}
|
1645
1968
|
|
1646
1969
|
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
|
1647
1970
|
(&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
|
1648
|
-
y_ds[j * (
|
1971
|
+
y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
|
1649
1972
|
}
|
1650
1973
|
|
1651
1974
|
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
|
@@ -1666,21 +1989,21 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
|
|
1666
1989
|
return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
|
1667
1990
|
}
|
1668
1991
|
|
1669
|
-
static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1992
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1670
1993
|
|
1671
|
-
__shared__ int tile_x_qs[
|
1672
|
-
__shared__ half2 tile_x_dm[
|
1994
|
+
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
|
1995
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
|
1673
1996
|
|
1674
1997
|
*x_ql = tile_x_qs;
|
1675
1998
|
*x_dm = tile_x_dm;
|
1676
1999
|
}
|
1677
2000
|
|
1678
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
|
2001
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
|
1679
2002
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1680
2003
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1681
2004
|
|
1682
2005
|
__builtin_assume(i_offset >= 0);
|
1683
|
-
__builtin_assume(i_offset <
|
2006
|
+
__builtin_assume(i_offset < nwarps);
|
1684
2007
|
__builtin_assume(k >= 0);
|
1685
2008
|
__builtin_assume(k < WARP_SIZE);
|
1686
2009
|
|
@@ -1690,7 +2013,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
1690
2013
|
const block_q4_1 * bx0 = (block_q4_1 *) vx;
|
1691
2014
|
|
1692
2015
|
#pragma unroll
|
1693
|
-
for (int i0 = 0; i0 <
|
2016
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
1694
2017
|
int i = i0 + i_offset;
|
1695
2018
|
|
1696
2019
|
if (need_check) {
|
@@ -1706,7 +2029,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
1706
2029
|
const int kbxd = k % blocks_per_tile_x_row;
|
1707
2030
|
|
1708
2031
|
#pragma unroll
|
1709
|
-
for (int i0 = 0; i0 <
|
2032
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
|
1710
2033
|
int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
|
1711
2034
|
|
1712
2035
|
if (need_check) {
|
@@ -1723,26 +2046,19 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
|
|
1723
2046
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1724
2047
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1725
2048
|
|
1726
|
-
__builtin_assume(i >= 0);
|
1727
|
-
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1728
|
-
__builtin_assume(j >= 0);
|
1729
|
-
__builtin_assume(j < WARP_SIZE);
|
1730
|
-
__builtin_assume(k >= 0);
|
1731
|
-
__builtin_assume(k < WARP_SIZE);
|
1732
|
-
|
1733
2049
|
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1734
2050
|
|
1735
2051
|
int u[2*VDR_Q4_1_Q8_1_MMQ];
|
1736
2052
|
|
1737
2053
|
#pragma unroll
|
1738
2054
|
for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
|
1739
|
-
u[2*l+0] = y_qs[j *
|
1740
|
-
u[2*l+1] = y_qs[j *
|
2055
|
+
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
|
2056
|
+
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
|
1741
2057
|
}
|
1742
2058
|
|
1743
2059
|
return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
|
1744
2060
|
(&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
|
1745
|
-
y_ds[j * (
|
2061
|
+
y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
|
1746
2062
|
}
|
1747
2063
|
|
1748
2064
|
static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
|
@@ -1765,21 +2081,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
|
|
1765
2081
|
return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
|
1766
2082
|
}
|
1767
2083
|
|
1768
|
-
static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2084
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1769
2085
|
|
1770
|
-
__shared__ int tile_x_ql[
|
1771
|
-
__shared__ float tile_x_d[
|
2086
|
+
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
2087
|
+
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
|
1772
2088
|
|
1773
2089
|
*x_ql = tile_x_ql;
|
1774
2090
|
*x_dm = (half2 *) tile_x_d;
|
1775
2091
|
}
|
1776
2092
|
|
1777
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
|
2093
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
|
1778
2094
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1779
2095
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1780
2096
|
|
1781
2097
|
__builtin_assume(i_offset >= 0);
|
1782
|
-
__builtin_assume(i_offset <
|
2098
|
+
__builtin_assume(i_offset < nwarps);
|
1783
2099
|
__builtin_assume(k >= 0);
|
1784
2100
|
__builtin_assume(k < WARP_SIZE);
|
1785
2101
|
|
@@ -1789,7 +2105,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
|
|
1789
2105
|
const block_q5_0 * bx0 = (block_q5_0 *) vx;
|
1790
2106
|
|
1791
2107
|
#pragma unroll
|
1792
|
-
for (int i0 = 0; i0 <
|
2108
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
1793
2109
|
int i = i0 + i_offset;
|
1794
2110
|
|
1795
2111
|
if (need_check) {
|
@@ -1825,7 +2141,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
|
|
1825
2141
|
float * x_dmf = (float *) x_dm;
|
1826
2142
|
|
1827
2143
|
#pragma unroll
|
1828
|
-
for (int i0 = 0; i0 <
|
2144
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
|
1829
2145
|
int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
|
1830
2146
|
|
1831
2147
|
if (need_check) {
|
@@ -1842,27 +2158,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
|
|
1842
2158
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1843
2159
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1844
2160
|
|
1845
|
-
__builtin_assume(i >= 0);
|
1846
|
-
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1847
|
-
__builtin_assume(j >= 0);
|
1848
|
-
__builtin_assume(j < WARP_SIZE);
|
1849
|
-
__builtin_assume(k >= 0);
|
1850
|
-
__builtin_assume(k < WARP_SIZE);
|
1851
|
-
|
1852
2161
|
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1853
2162
|
const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
|
1854
|
-
const float * x_dmf = (float *) x_dm;
|
2163
|
+
const float * x_dmf = (const float *) x_dm;
|
2164
|
+
const float * y_df = (const float *) y_ds;
|
1855
2165
|
|
1856
2166
|
int u[2*VDR_Q5_0_Q8_1_MMQ];
|
1857
2167
|
|
1858
2168
|
#pragma unroll
|
1859
2169
|
for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
|
1860
|
-
u[2*l+0] = y_qs[j *
|
1861
|
-
u[2*l+1] = y_qs[j *
|
2170
|
+
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
|
2171
|
+
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
|
1862
2172
|
}
|
1863
2173
|
|
1864
2174
|
return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
|
1865
|
-
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx],
|
2175
|
+
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
|
1866
2176
|
}
|
1867
2177
|
|
1868
2178
|
static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
|
@@ -1885,21 +2195,21 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
|
|
1885
2195
|
return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
|
1886
2196
|
}
|
1887
2197
|
|
1888
|
-
static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2198
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1889
2199
|
|
1890
|
-
__shared__ int tile_x_ql[
|
1891
|
-
__shared__ half2 tile_x_dm[
|
2200
|
+
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
2201
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
|
1892
2202
|
|
1893
2203
|
*x_ql = tile_x_ql;
|
1894
2204
|
*x_dm = tile_x_dm;
|
1895
2205
|
}
|
1896
2206
|
|
1897
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
|
2207
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
|
1898
2208
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1899
2209
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1900
2210
|
|
1901
2211
|
__builtin_assume(i_offset >= 0);
|
1902
|
-
__builtin_assume(i_offset <
|
2212
|
+
__builtin_assume(i_offset < nwarps);
|
1903
2213
|
__builtin_assume(k >= 0);
|
1904
2214
|
__builtin_assume(k < WARP_SIZE);
|
1905
2215
|
|
@@ -1909,7 +2219,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
|
|
1909
2219
|
const block_q5_1 * bx0 = (block_q5_1 *) vx;
|
1910
2220
|
|
1911
2221
|
#pragma unroll
|
1912
|
-
for (int i0 = 0; i0 <
|
2222
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
1913
2223
|
int i = i0 + i_offset;
|
1914
2224
|
|
1915
2225
|
if (need_check) {
|
@@ -1942,7 +2252,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
|
|
1942
2252
|
const int kbxd = k % blocks_per_tile_x_row;
|
1943
2253
|
|
1944
2254
|
#pragma unroll
|
1945
|
-
for (int i0 = 0; i0 <
|
2255
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
|
1946
2256
|
int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
|
1947
2257
|
|
1948
2258
|
if (need_check) {
|
@@ -1959,13 +2269,6 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
|
|
1959
2269
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1960
2270
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1961
2271
|
|
1962
|
-
__builtin_assume(i >= 0);
|
1963
|
-
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1964
|
-
__builtin_assume(j >= 0);
|
1965
|
-
__builtin_assume(j < WARP_SIZE);
|
1966
|
-
__builtin_assume(k >= 0);
|
1967
|
-
__builtin_assume(k < WARP_SIZE);
|
1968
|
-
|
1969
2272
|
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1970
2273
|
const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
|
1971
2274
|
|
@@ -1973,12 +2276,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
|
|
1973
2276
|
|
1974
2277
|
#pragma unroll
|
1975
2278
|
for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
|
1976
|
-
u[2*l+0] = y_qs[j *
|
1977
|
-
u[2*l+1] = y_qs[j *
|
2279
|
+
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
|
2280
|
+
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
|
1978
2281
|
}
|
1979
2282
|
|
1980
2283
|
return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
|
1981
|
-
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (
|
2284
|
+
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
|
1982
2285
|
}
|
1983
2286
|
|
1984
2287
|
static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
|
@@ -1989,29 +2292,30 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
|
|
1989
2292
|
int v[VDR_Q8_0_Q8_1_MMVQ];
|
1990
2293
|
int u[VDR_Q8_0_Q8_1_MMVQ];
|
1991
2294
|
|
2295
|
+
#pragma unroll
|
1992
2296
|
for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
|
1993
2297
|
v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
|
1994
2298
|
u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1995
2299
|
}
|
1996
2300
|
|
1997
|
-
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds);
|
2301
|
+
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
|
1998
2302
|
}
|
1999
2303
|
|
2000
|
-
static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2304
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2001
2305
|
|
2002
|
-
__shared__ int tile_x_qs[
|
2003
|
-
__shared__ float tile_x_d[
|
2306
|
+
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
|
2307
|
+
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
|
2004
2308
|
|
2005
2309
|
*x_ql = tile_x_qs;
|
2006
2310
|
*x_dm = (half2 *) tile_x_d;
|
2007
2311
|
}
|
2008
2312
|
|
2009
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
|
2313
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
|
2010
2314
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2011
2315
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2012
2316
|
|
2013
2317
|
__builtin_assume(i_offset >= 0);
|
2014
|
-
__builtin_assume(i_offset <
|
2318
|
+
__builtin_assume(i_offset < nwarps);
|
2015
2319
|
__builtin_assume(k >= 0);
|
2016
2320
|
__builtin_assume(k < WARP_SIZE);
|
2017
2321
|
|
@@ -2022,7 +2326,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
|
|
2022
2326
|
const block_q8_0 * bx0 = (block_q8_0 *) vx;
|
2023
2327
|
|
2024
2328
|
#pragma unroll
|
2025
|
-
for (int i0 = 0; i0 <
|
2329
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2026
2330
|
int i = i0 + i_offset;
|
2027
2331
|
|
2028
2332
|
if (need_check) {
|
@@ -2032,76 +2336,35 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
|
|
2032
2336
|
const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
2033
2337
|
|
2034
2338
|
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
|
2035
|
-
x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
|
2036
2339
|
}
|
2037
2340
|
|
2038
|
-
|
2039
|
-
|
2341
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
|
2342
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
2040
2343
|
|
2041
|
-
|
2042
|
-
|
2043
|
-
|
2044
|
-
// const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
|
2344
|
+
#pragma unroll
|
2345
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
|
2346
|
+
int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
|
2045
2347
|
|
2046
|
-
|
2047
|
-
|
2048
|
-
|
2049
|
-
// }
|
2050
|
-
// #endif // GGML_CUDA_MMQ_Y < 64
|
2348
|
+
if (need_check) {
|
2349
|
+
i = min(i, i_max);
|
2350
|
+
}
|
2051
2351
|
|
2052
|
-
|
2352
|
+
const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
2053
2353
|
|
2054
|
-
|
2055
|
-
|
2354
|
+
x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
|
2355
|
+
}
|
2056
2356
|
}
|
2057
2357
|
|
2058
2358
|
static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
|
2059
2359
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2060
2360
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2061
2361
|
|
2062
|
-
|
2063
|
-
|
2064
|
-
__builtin_assume(j >= 0);
|
2065
|
-
__builtin_assume(j < WARP_SIZE);
|
2066
|
-
__builtin_assume(k >= 0);
|
2067
|
-
__builtin_assume(k < WARP_SIZE);
|
2068
|
-
|
2069
|
-
const float * x_dmf = (float *) x_dm;
|
2362
|
+
const float * x_dmf = (const float *) x_dm;
|
2363
|
+
const float * y_df = (const float *) y_ds;
|
2070
2364
|
|
2071
2365
|
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
|
2072
2366
|
(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
|
2073
|
-
|
2074
|
-
}
|
2075
|
-
|
2076
|
-
#define VDR_q2_K_q8_1 1
|
2077
|
-
|
2078
|
-
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl(
|
2079
|
-
const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
2080
|
-
const half2 & dm, const float * __restrict__ d8) {
|
2081
|
-
|
2082
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2083
|
-
float sumf_d = 0.0f;
|
2084
|
-
float sumf_m = 0.0f;
|
2085
|
-
|
2086
|
-
for (int i = 0; i < QR2_K; ++i) {
|
2087
|
-
const int sc = scales[2*i];
|
2088
|
-
|
2089
|
-
const int vi = (v >> (2*i)) & 0x03030303;
|
2090
|
-
|
2091
|
-
sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
|
2092
|
-
|
2093
|
-
int sc_high = sc >> 4;
|
2094
|
-
sc_high |= sc_high << 8;
|
2095
|
-
sc_high |= sc_high << 16;
|
2096
|
-
sumf_m += d8[i] * __dp4a(sc_high, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
|
2097
|
-
}
|
2098
|
-
|
2099
|
-
const float2 dmf = __half22float2(dm);
|
2100
|
-
|
2101
|
-
return dmf.x*sumf_d - dmf.y*sumf_m;
|
2102
|
-
#else
|
2103
|
-
return 0.0f; // only to satisfy the compiler
|
2104
|
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2367
|
+
y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
|
2105
2368
|
}
|
2106
2369
|
|
2107
2370
|
static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
|
@@ -2115,34 +2378,35 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
|
|
2115
2378
|
const uint8_t * scales = bq2_K->scales + scale_offset;
|
2116
2379
|
|
2117
2380
|
const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
|
2118
|
-
int
|
2381
|
+
int u[QR2_K];
|
2119
2382
|
float d8[QR2_K];
|
2120
2383
|
|
2384
|
+
#pragma unroll
|
2121
2385
|
for (int i = 0; i < QR2_K; ++ i) {
|
2122
2386
|
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
2123
2387
|
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
2124
2388
|
}
|
2125
2389
|
|
2126
|
-
return
|
2390
|
+
return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
|
2127
2391
|
}
|
2128
2392
|
|
2129
|
-
static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2393
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2130
2394
|
|
2131
|
-
__shared__ int tile_x_ql[
|
2132
|
-
__shared__ half2 tile_x_dm[
|
2133
|
-
__shared__ int tile_x_sc[
|
2395
|
+
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
|
2396
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
|
2397
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
|
2134
2398
|
|
2135
2399
|
*x_ql = tile_x_ql;
|
2136
2400
|
*x_dm = tile_x_dm;
|
2137
2401
|
*x_sc = tile_x_sc;
|
2138
2402
|
}
|
2139
2403
|
|
2140
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
|
2404
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
|
2141
2405
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2142
2406
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2143
2407
|
|
2144
2408
|
__builtin_assume(i_offset >= 0);
|
2145
|
-
__builtin_assume(i_offset <
|
2409
|
+
__builtin_assume(i_offset < nwarps);
|
2146
2410
|
__builtin_assume(k >= 0);
|
2147
2411
|
__builtin_assume(k < WARP_SIZE);
|
2148
2412
|
|
@@ -2152,7 +2416,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
|
|
2152
2416
|
const block_q2_K * bx0 = (block_q2_K *) vx;
|
2153
2417
|
|
2154
2418
|
#pragma unroll
|
2155
|
-
for (int i0 = 0; i0 <
|
2419
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2156
2420
|
int i = i0 + i_offset;
|
2157
2421
|
|
2158
2422
|
if (need_check) {
|
@@ -2168,8 +2432,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
|
|
2168
2432
|
const int kbxd = k % blocks_per_tile_x_row;
|
2169
2433
|
|
2170
2434
|
#pragma unroll
|
2171
|
-
for (int i0 = 0; i0 <
|
2172
|
-
int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) %
|
2435
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
|
2436
|
+
int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
|
2173
2437
|
|
2174
2438
|
if (need_check) {
|
2175
2439
|
i = min(i, i_max);
|
@@ -2181,7 +2445,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
|
|
2181
2445
|
}
|
2182
2446
|
|
2183
2447
|
#pragma unroll
|
2184
|
-
for (int i0 = 0; i0 <
|
2448
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
|
2185
2449
|
int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
|
2186
2450
|
|
2187
2451
|
if (need_check) {
|
@@ -2198,68 +2462,24 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
|
|
2198
2462
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2199
2463
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2200
2464
|
|
2201
|
-
|
2202
|
-
|
2203
|
-
|
2204
|
-
__builtin_assume(j < WARP_SIZE);
|
2205
|
-
__builtin_assume(k >= 0);
|
2206
|
-
__builtin_assume(k < WARP_SIZE);
|
2207
|
-
|
2208
|
-
const int kbx = k / QI2_K;
|
2209
|
-
const int kqsx = k % QI2_K;
|
2465
|
+
const int kbx = k / QI2_K;
|
2466
|
+
const int ky = (k % QI2_K) * QR2_K;
|
2467
|
+
const float * y_df = (const float *) y_ds;
|
2210
2468
|
|
2211
|
-
|
2212
|
-
const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
|
2469
|
+
int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
|
2213
2470
|
|
2214
|
-
const
|
2471
|
+
const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
|
2472
|
+
const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
|
2215
2473
|
|
2216
|
-
|
2217
|
-
|
2218
|
-
|
2219
|
-
for (int l = 0; l < QR2_K; ++ l) {
|
2220
|
-
const int y_qs_index = j * (QR2_K*WARP_SIZE) + kbx * (QR2_K*QI2_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
|
2221
|
-
u[l] = y_qs[y_qs_index];
|
2222
|
-
d8[l] = y_ds[y_qs_index / QI8_1].x;
|
2474
|
+
#pragma unroll
|
2475
|
+
for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
|
2476
|
+
v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
|
2223
2477
|
}
|
2224
2478
|
|
2225
|
-
|
2226
|
-
}
|
2227
|
-
|
2228
|
-
#define VDR_q3_K_q8_1 1
|
2229
|
-
|
2230
|
-
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl(
|
2231
|
-
const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
2232
|
-
const int & scale_offset, const float & d, const float * __restrict__ d8) {
|
2233
|
-
|
2234
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2235
|
-
float sumf = 0.0f;
|
2236
|
-
|
2237
|
-
for (int i = 0; i < QR3_K; ++i) {
|
2238
|
-
const int isc = scale_offset + 2*i;
|
2479
|
+
const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
|
2239
2480
|
|
2240
|
-
|
2241
|
-
|
2242
|
-
const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
|
2243
|
-
|
2244
|
-
const int isc_high = isc % (QK_K/64);
|
2245
|
-
const int sc_shift_high = 2 * (isc / (QK_K/64));
|
2246
|
-
const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
|
2247
|
-
|
2248
|
-
const int sc = (sc_low | sc_high) - 32;
|
2249
|
-
|
2250
|
-
const int vil = (vl >> (2*i)) & 0x03030303;
|
2251
|
-
|
2252
|
-
const int vih = ((vh >> i) << 2) & 0x04040404;
|
2253
|
-
|
2254
|
-
const int vi = __vsubss4(vil, vih);
|
2255
|
-
|
2256
|
-
sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
|
2257
|
-
}
|
2258
|
-
|
2259
|
-
return d*sumf;
|
2260
|
-
#else
|
2261
|
-
return 0.0f; // only to satisfy the compiler
|
2262
|
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2481
|
+
const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
|
2482
|
+
return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
|
2263
2483
|
}
|
2264
2484
|
|
2265
2485
|
static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
@@ -2277,23 +2497,24 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
|
2277
2497
|
// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
|
2278
2498
|
const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
|
2279
2499
|
|
2280
|
-
int
|
2500
|
+
int u[QR3_K];
|
2281
2501
|
float d8[QR3_K];
|
2282
2502
|
|
2503
|
+
#pragma unroll
|
2283
2504
|
for (int i = 0; i < QR3_K; ++i) {
|
2284
2505
|
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
2285
2506
|
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
2286
2507
|
}
|
2287
2508
|
|
2288
|
-
return
|
2509
|
+
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
|
2289
2510
|
}
|
2290
2511
|
|
2291
|
-
static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2512
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2292
2513
|
|
2293
|
-
__shared__ int tile_x_ql[
|
2294
|
-
__shared__ half2 tile_x_dm[
|
2295
|
-
__shared__ int tile_x_qh[
|
2296
|
-
__shared__ int tile_x_sc[
|
2514
|
+
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
|
2515
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
|
2516
|
+
__shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
|
2517
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
|
2297
2518
|
|
2298
2519
|
*x_ql = tile_x_ql;
|
2299
2520
|
*x_dm = tile_x_dm;
|
@@ -2301,12 +2522,12 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 **
|
|
2301
2522
|
*x_sc = tile_x_sc;
|
2302
2523
|
}
|
2303
2524
|
|
2304
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
|
2525
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
|
2305
2526
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2306
2527
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2307
2528
|
|
2308
2529
|
__builtin_assume(i_offset >= 0);
|
2309
|
-
__builtin_assume(i_offset <
|
2530
|
+
__builtin_assume(i_offset < nwarps);
|
2310
2531
|
__builtin_assume(k >= 0);
|
2311
2532
|
__builtin_assume(k < WARP_SIZE);
|
2312
2533
|
|
@@ -2316,7 +2537,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
|
|
2316
2537
|
const block_q3_K * bx0 = (block_q3_K *) vx;
|
2317
2538
|
|
2318
2539
|
#pragma unroll
|
2319
|
-
for (int i0 = 0; i0 <
|
2540
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2320
2541
|
int i = i0 + i_offset;
|
2321
2542
|
|
2322
2543
|
if (need_check) {
|
@@ -2330,10 +2551,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
|
|
2330
2551
|
|
2331
2552
|
const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
|
2332
2553
|
const int kbxd = k % blocks_per_tile_x_row;
|
2554
|
+
float * x_dmf = (float *) x_dm;
|
2333
2555
|
|
2334
2556
|
#pragma unroll
|
2335
|
-
for (int i0 = 0; i0 <
|
2336
|
-
int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) %
|
2557
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
|
2558
|
+
int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
|
2337
2559
|
|
2338
2560
|
if (need_check) {
|
2339
2561
|
i = min(i, i_max);
|
@@ -2341,11 +2563,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
|
|
2341
2563
|
|
2342
2564
|
const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2343
2565
|
|
2344
|
-
|
2566
|
+
x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
|
2345
2567
|
}
|
2346
2568
|
|
2347
2569
|
#pragma unroll
|
2348
|
-
for (int i0 = 0; i0 <
|
2570
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
|
2349
2571
|
int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
|
2350
2572
|
|
2351
2573
|
if (need_check) {
|
@@ -2354,11 +2576,12 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
|
|
2354
2576
|
|
2355
2577
|
const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
|
2356
2578
|
|
2357
|
-
|
2579
|
+
// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
|
2580
|
+
x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
|
2358
2581
|
}
|
2359
2582
|
|
2360
2583
|
#pragma unroll
|
2361
|
-
for (int i0 = 0; i0 <
|
2584
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
|
2362
2585
|
int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
|
2363
2586
|
|
2364
2587
|
if (need_check) {
|
@@ -2367,7 +2590,19 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
|
|
2367
2590
|
|
2368
2591
|
const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
|
2369
2592
|
|
2370
|
-
|
2593
|
+
const int ksc = k % (QI3_K/4);
|
2594
|
+
|
2595
|
+
const int ksc_low = ksc % (QI3_K/8);
|
2596
|
+
const int shift_low = 4 * (ksc / (QI3_K/8));
|
2597
|
+
const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
|
2598
|
+
|
2599
|
+
const int ksc_high = QI3_K/8;
|
2600
|
+
const int shift_high = 2 * ksc;
|
2601
|
+
const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
|
2602
|
+
|
2603
|
+
const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
|
2604
|
+
|
2605
|
+
x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
|
2371
2606
|
}
|
2372
2607
|
}
|
2373
2608
|
|
@@ -2375,63 +2610,29 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-
     const int kbx = k / QI3_K;
-    const int
-
-    const
-    const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
-
-    const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16;
+    const int ky = (k % QI3_K) * QR3_K;
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df = (const float *) y_ds;

-
-    const int vh = ~x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset;
+    const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;

-    int
-    float d8[QR3_K];
-
-    for (int l = 0; l < QR3_K; ++ l) {
-        const int y_qs_index = j * (QR3_K*WARP_SIZE) + kbx * (QR3_K*QI3_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
-        u[l] = y_qs[y_qs_index];
-        d8[l] = y_ds[y_qs_index / QI8_1].x;
-    }
-
-    return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset,
-                                  x_dm[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx].x, d8);
-}
-
-#define VDR_q4_K_q8_1 2
-
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
+    int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];

-
-
-    const int
+#pragma unroll
+    for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
+        const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
+        const int shift = 2 * ((ky % 32) / 8);
+        const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;

-        const int
-        const int
+        const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
+        const int vlh = (vh << 2) & 0x04040404;

-
-        sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
+        v[l] = __vsubss4(vll, vlh);
     }

-
-
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
+    return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
@@ -2478,7 +2679,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
         u[2*i+1] = q8[4];
     }

-    return
+    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);

 #else

@@ -2527,23 +2728,23 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 #endif
 }

-static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int tile_x_ql[
-    __shared__ half2 tile_x_dm[
-    __shared__ int tile_x_sc[
+    __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
+    __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];

     *x_ql = tile_x_ql;
     *x_dm = tile_x_dm;
     *x_sc = tile_x_sc;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset <
+    __builtin_assume(i_offset < nwarps);
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);

@@ -2553,7 +2754,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
     const block_q4_K * bx0 = (block_q4_K *) vx;

 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
@@ -2563,118 +2764,62 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;

         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-
-#pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) {
-        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
-
-        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI4_K/8));
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-
-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-
-    const int kbx = k / QI6_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI6_K; // == k if QK_K == 256
-
-    int v[2];
-    int u[2*QR4_K];
-    float d8[QR4_K];
+    }

-
-    const int
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
+    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256

-
-
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
+        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;

-
-
-
-
-
-            aux[1] = scales[l+2] & 0x3f3f;
-        } else {
-            aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
-            aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
-        }
-        const uint8_t * sc = (const uint8_t *)aux;
-        const uint8_t * m = sc + 2;
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;

-
-        const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
-        u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
-        u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
-        d8[l] = y_ds[kqsy / QI8_1].x;
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
     }

-
-
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;

-
+        if (need_check) {
+            i = min(i, i_max);
+        }

-
-    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
+        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);

-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
+        const int * scales = (int *) bxi->scales;

-
-        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
-        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
+        const int ksc = k % (WARP_SIZE/8);

-
-
+        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
+        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
+        scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits

-
-
+        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
+    }
+}

-
-
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
+    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
+    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-
-        sumf_m += d8[i] * (dot2 * m[i]);
+    int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];

+#pragma unroll
+    for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
+        v[l + 0]         = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
+        v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
     }

-
+    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

-
-    return
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
+    return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
@@ -2711,6 +2856,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     const uint8_t * sc = (const uint8_t *)aux;
     const uint8_t * m = sc + 2;

+#pragma unroll
     for (int i = 0; i < QR5_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
         d8[i] = bq8i->ds.x;
@@ -2765,25 +2911,23 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #endif
 }

-static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int tile_x_ql[
-    __shared__ half2 tile_x_dm[
-    __shared__ int
-    __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
+    __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
+    __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];

     *x_ql = tile_x_ql;
     *x_dm = tile_x_dm;
-    *x_qh = tile_x_qh;
     *x_sc = tile_x_sc;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset <
+    __builtin_assume(i_offset < nwarps);
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);

@@ -2793,7 +2937,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
     const block_q5_K * bx0 = (block_q5_K *) vx;

 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
@@ -2801,16 +2945,29 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
         }

         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
+        const int ky = QR5_K*kqsx;

-
+        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
+        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
+        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
+
+        const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
+        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
+        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
+
+        const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
+        const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
+
+        x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
+        x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
     }

     const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;
+    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256

 #pragma unroll
-    for (int i0 = 0; i0 <
-        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) %
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
+        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;

         if (need_check) {
             i = min(i, i_max);
@@ -2822,29 +2979,24 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
     }

 #pragma unroll
-    for (int i0 = 0; i0 <
-        int i = i0 + i_offset *
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;

         if (need_check) {
             i = min(i, i_max);
         }

-        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/
-
-        x_qh[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->qh, k % (QI5_K/4));
-    }
+        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);

-
-    for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
+        const int * scales = (int *) bxi->scales;

-
-            i = min(i, i_max);
-        }
+        const int ksc = k % (WARP_SIZE/8);

-
+        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
+        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
+        scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits

-        x_sc[i * (WARP_SIZE/8) + i / 8 +
+        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
     }
 }

@@ -2852,77 +3004,11 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-
-    const int kbx = k / QI6_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI6_K; // == k if QK_K == 256
-
-    int vl[2];
-    int vh[2];
-    int u[2*QR4_K];
-    float d8[QR4_K];
-
-    const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2));
-
-    vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
-    vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
-
-    vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset;
-    vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset;
-
-    const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
-    uint16_t aux[2];
-    const int l = bq8_offset/2;
-    if (l < 2) {
-        aux[0] = scales[l+0] & 0x3f3f;
-        aux[1] = scales[l+2] & 0x3f3f;
-    } else {
-        aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
-        aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
-    }
-    const uint8_t * sc = (const uint8_t *)aux;
-    const uint8_t * m = sc + 2;
-
-    for (int l = 0; l < QR5_K; ++l) {
-        const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
-        u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
-        u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
-        d8[l] = y_ds[kqsy / QI8_1].x;
-    }
-
-    return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K + kbx], d8);
-}
-
-#define VDR_q6_K_q8_1 1
-
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl(
-    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
-    const float & d, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf = 0.0f;
-
-    for (int i = 0; i < QR6_K; ++i) {
-        const int sc = scales[4*i];
-
-        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
-
-        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
-
-        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
-
-        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
-    }
+    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);

-
-
-    return
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
+    const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
+    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -2942,33 +3028,32 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
     int u[QR6_K];
     float d8[QR6_K];

+#pragma unroll
     for (int i = 0; i < QR6_K; ++i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
         d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
     }

-    return
+    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
 }

-static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int tile_x_ql[
-    __shared__ half2 tile_x_dm[
-    __shared__ int
-    __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
+    __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
+    __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];

     *x_ql = tile_x_ql;
     *x_dm = tile_x_dm;
-    *x_qh = tile_x_qh;
     *x_sc = tile_x_sc;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset <
+    __builtin_assume(i_offset < nwarps);
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);

@@ -2978,7 +3063,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
     const block_q6_K * bx0 = (block_q6_K *) vx;

 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
|
2986
3071
|
}
|
2987
3072
|
|
2988
3073
|
const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
|
3074
|
+
const int ky = QR6_K*kqsx;
|
2989
3075
|
|
2990
|
-
|
2991
|
-
|
2992
|
-
|
2993
|
-
const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
|
2994
|
-
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
2995
|
-
|
2996
|
-
#pragma unroll
|
2997
|
-
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
|
2998
|
-
int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
|
3076
|
+
const int ql = get_int_from_uint8(bxi->ql, kqsx);
|
3077
|
+
const int ql0 = (ql >> 0) & 0x0F0F0F0F;
|
3078
|
+
const int ql1 = (ql >> 4) & 0x0F0F0F0F;
|
2999
3079
|
|
3000
|
-
|
3001
|
-
|
3002
|
-
|
3080
|
+
const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
|
3081
|
+
const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
|
3082
|
+
const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
|
3003
3083
|
|
3004
|
-
const
|
3084
|
+
const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
|
3085
|
+
const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
|
3005
3086
|
|
3006
|
-
|
3087
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
|
3088
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
|
3007
3089
|
}
|
3008
3090
|
|
3091
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
|
3092
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
3093
|
+
float * x_dmf = (float *) x_dm;
|
3094
|
+
|
3009
3095
|
#pragma unroll
|
3010
|
-
for (int i0 = 0; i0 <
|
3011
|
-
int i = i0 + i_offset *
|
3096
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
|
3097
|
+
int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
|
3012
3098
|
|
3013
3099
|
if (need_check) {
|
3014
3100
|
i = min(i, i_max);
|
3015
3101
|
}
|
3016
3102
|
|
3017
|
-
const block_q6_K * bxi = bx0 + i*blocks_per_row +
|
3103
|
+
const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
3018
3104
|
|
3019
|
-
|
3105
|
+
x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
|
3020
3106
|
}
|
3021
3107
|
|
3022
3108
|
#pragma unroll
|
3023
|
-
for (int i0 = 0; i0 <
|
3024
|
-
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) %
|
3109
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
|
3110
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
|
3025
3111
|
|
3026
3112
|
if (need_check) {
|
3027
3113
|
i = min(i, i_max);
|
@@ -3037,39 +3123,17 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-
-
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-
-    const int kbx = k / QI6_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI6_K; // == k if QK_K == 256
-
-    const int bq8_offset = 2 * QR6_K * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/4);
-    const int scale_offset = (QI6_K/4) * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/8);
-    const int vh_shift = 2 * ((kqsx % (QI6_K/2)) / (QI6_K/4));
-
-    const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI6_K/2) + (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)] >> vh_shift;
-
-    const int x_sc_offset = i * (WARP_SIZE/8) + i/8 + kbx * (QI6_K/8);
-    const int8_t * scales = ((int8_t *) (x_sc + x_sc_offset)) + scale_offset;
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df = (const float *) y_ds;

-
-    float d8[QR6_K];
-
-    for (int l = 0; l < QR6_K; ++l) {
-        const int kqsy = j * (QR6_K*WARP_SIZE) + kbx * (QR6_K*QI6_K) + (bq8_offset + 2*l)*QI8_1 + kqsx % QI8_1;
-        u[l] = y_qs[kqsy];
-        d8[l] = y_ds[kqsy / QI8_1].x;
-    }
+    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);

-
-
+    const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
+    const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
+    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
 }

-template <int qk, int qr, int qi, typename block_q_t,
+template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
               allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
 static __global__ void mul_mat_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
@@ -3084,14 +3148,11 @@ static __global__ void mul_mat_q(

     const int & ncols_dst = ncols_y;

-    const int
-    const int tid_y = threadIdx.y;
-
-    const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
+    const int row_dst_0 = blockIdx.x*mmq_y;
     const int & row_x_0 = row_dst_0;
-    const int row_dst = row_dst_0 +
+    const int row_dst = row_dst_0 + threadIdx.x;

-    const int col_dst_0 = blockIdx.y*
+    const int col_dst_0 = blockIdx.y*mmq_x;
     const int & col_y_0 = col_dst_0;

     int * tile_x_ql = nullptr;
@@ -3101,55 +3162,65 @@ static __global__ void mul_mat_q(

     allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);

-
-
-    __shared__ int   tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
-    __shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];
+    __shared__ int   tile_y_qs[mmq_x * WARP_SIZE];
+    __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];

-    float sum[
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};

     for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {

         load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
-
+                   threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);

+#pragma unroll
         for (int ir = 0; ir < qr; ++ir) {
-            const int kqs = ir*WARP_SIZE +
+            const int kqs = ir*WARP_SIZE + threadIdx.x;
             const int kbxd = kqs / QI8_1;

-
-
+#pragma unroll
+            for (int i = 0; i < mmq_x; i += nwarps) {
+                const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses

                 const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];

-
+                const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
+                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
             }
-            }

-
-
-
-
-
-
+#pragma unroll
+            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
+                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
+                const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
+                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
+
+                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
+                const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
+                half2       * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
+                if (need_sum) {
+                    *dsi_dst = *dsi_src;
+                } else {
+                    float * dfi_dst = (float *) dsi_dst;
+                    *dfi_dst = (*dsi_src).x;
+                }
+            }

-
+            __syncthreads();

-#
-
-#endif // __CUDA_ARCH__ >= 700
-            for (int k = 0; k < WARP_SIZE; k += vdr) {
+// #pragma unroll // unrolling this loop causes too much register pressure
+            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
 #pragma unroll
-
+                for (int j = 0; j < mmq_x; j += nwarps) {
 #pragma unroll
-
-
-
+                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+                        sum[i/WARP_SIZE][j/nwarps] += vec_dot(
+                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
+                            threadIdx.x + i, threadIdx.y + j, k);
+                    }
                 }
             }
-            }

-
+            __syncthreads();
+        }
     }

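For orientation on the rewritten mul_mat_q kernel above: each thread block now covers an mmq_y x mmq_x output tile, with threadIdx.x stepping over rows in strides of WARP_SIZE and threadIdx.y stepping over columns in strides of nwarps, and each thread keeping sum[mmq_y/WARP_SIZE][mmq_x/nwarps] accumulators. A small host-side C++ sketch of that index arithmetic, with hypothetical block/thread indices:

#include <cstdio>

int main() {
    // Hypothetical tile configuration mirroring the kernel's template parameters.
    const int WARP_SIZE = 32, mmq_x = 64, mmq_y = 128, nwarps = 4;
    const int block_idx_x = 1, block_idx_y = 2; // one CUDA block
    const int tid_x = 5, tid_y = 3;             // one thread in that block

    const int row_dst_0 = block_idx_x * mmq_y;
    const int col_dst_0 = block_idx_y * mmq_x;

    // Enumerate the output elements this one thread is responsible for.
    for (int j = 0; j < mmq_x; j += nwarps) {
        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
            const int row = row_dst_0 + tid_x + i;
            const int col = col_dst_0 + tid_y + j;
            printf("sum[%d][%d] -> dst(row=%d, col=%d)\n", i / WARP_SIZE, j / nwarps, row, col);
        }
    }
    return 0;
}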
@@ -3157,15 +3228,15 @@ static __global__ void mul_mat_q(
         return;
     }

-    for (int j = 0; j <
-        const int col_dst = col_dst_0 + j +
+    for (int j = 0; j < mmq_x; j += nwarps) {
+        const int col_dst = col_dst_0 + j + threadIdx.y;

         if (col_dst >= ncols_dst) {
             return;
         }

-        for (int i = 0; i <
-            dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/
+        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+            dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/nwarps];
         }
     }
 }
@@ -3780,7 +3851,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
+    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

@@ -3789,7 +3860,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
+    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

@@ -3798,7 +3869,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
+    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

@@ -3807,7 +3878,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
+    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

@@ -3816,7 +3887,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
+    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

@@ -3867,17 +3938,52 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-
-
-    const
-
-
-
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x = 64;
+        const int mmq_y = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-
-
+        const int mmq_x = 64;
+        const int mmq_y = 64;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }

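The launcher above (and the analogous ones that follow for the other quantization types) now picks mmq_x, mmq_y, and nwarps per compute capability (CC_TURING == 700) and derives the launch grid by ceiling division, falling back to bounds checking only when the row count is not a multiple of the tile height. A tiny host-side C++ sketch of that grid calculation, using made-up matrix sizes:

#include <cstdio>

int main() {
    // Made-up problem size; mmq_x/mmq_y as chosen for the Turing branch above.
    const int nrows_x = 4096, ncols_y = 512;
    const int mmq_x = 64, mmq_y = 128;

    // Ceiling division, as in (n + d - 1) / d.
    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;

    // Bounds checking is only needed when the rows do not fill whole tiles.
    const bool need_check = (nrows_x % mmq_y != 0);

    printf("grid = (%d, %d), need_check = %d\n", block_num_x, block_num_y, (int) need_check);
    return 0;
}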
@@ -3885,17 +3991,53 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-
-
-    const
-
-
-
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x = 64;
+        const int mmq_y = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-
-
+        const int mmq_x = 64;
+        const int mmq_y = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
+
     }
 }

@@ -3903,17 +4045,52 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-
-
-    const
-
-
-
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x = 128;
+        const int mmq_y = 64;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-
-
+        const int mmq_x = 64;
+        const int mmq_y = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }

@@ -3921,17 +4098,52 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-
-
-    const
-
-
-
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x = 128;
+        const int mmq_y = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-
-
+        const int mmq_x = 64;
+        const int mmq_y = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }

@@ -3939,17 +4151,52 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-
-
-    const
-
-
-
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x = 128;
+        const int mmq_y = 64;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-
-
+        const int mmq_x = 64;
+        const int mmq_y = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }

@@ -3957,17 +4204,52 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-
-
-    const
-
-
-
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x = 64;
+        const int mmq_y = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-
-
+        const int mmq_x = 64;
+        const int mmq_y = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }

@@ -3975,17 +4257,52 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-
-
-    const
-
-
-
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x = 128;
+        const int mmq_y = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-
-
+        const int mmq_x = 64;
+        const int mmq_y = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }

@@ -3993,17 +4310,52 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-
-
-    const
-
-
-
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x = 64;
+        const int mmq_y = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-
-
+        const int mmq_x = 32;
+        const int mmq_y = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }

@@ -4011,17 +4363,52 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-
-
-    const
-
-
-
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x = 64;
+        const int mmq_y = 128;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-
-
+        const int mmq_x = 64;
+        const int mmq_y = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }

@@ -4029,17 +4416,52 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-
-
-    const
-
-
-
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING) {
+        const int mmq_x = 64;
+        const int mmq_y = 64;
+        const int nwarps = 4;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     } else {
-
-
+        const int mmq_x = 32;
+        const int mmq_y = 64;
+        const int nwarps = 8;
+
+        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+        const dim3 block_nums(block_num_x, block_num_y, 1);
+        const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+        if (nrows_x % mmq_y == 0) {
+            const bool need_check = false;
+            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        } else {
+            const bool need_check = true;
+            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        }
     }
 }

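Note on the launchers above: every ggml_mul_mat_qX_K_q8_1_cuda variant now picks its tile shape (mmq_x, mmq_y) and warp count per architecture (CC_TURING = 700 and up versus older devices) and then derives the launch grid the same way: one thread block per mmq_y x mmq_x output tile, with the need_check path enabled only when nrows_x is not a multiple of mmq_y. The host-side sketch below is illustrative only and not part of the diff; WARP_SIZE and the matrix shape are assumed example values.

    // Illustrative sketch of the launch geometry used by the new mul_mat_q
    // launchers. All concrete numbers here are assumed example values.
    #include <cstdio>

    int main() {
        const int WARP_SIZE = 32;           // warp width assumed by ggml-cuda.cu
        const int mmq_x = 64, mmq_y = 128;  // e.g. the Turing tile chosen for q4_K above
        const int nwarps = 4;

        const int nrows_x = 4096;           // rows of the quantized matrix (assumed)
        const int ncols_y = 512;            // columns of the q8_1 matrix (assumed)

        // ceil-division: one thread block per (mmq_y x mmq_x) tile of the output
        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;

        std::printf("grid  = (%d, %d, 1)\n", block_num_x, block_num_y);
        std::printf("block = (%d, %d, 1) -> %d threads\n", WARP_SIZE, nwarps, WARP_SIZE*nwarps);
        // need_check stays false only when no tile hangs over the last rows
        std::printf("need_check = %s\n", nrows_x % mmq_y == 0 ? "false" : "true");
        return 0;
    }

With these numbers the grid is 32 x 8 blocks of 128 threads each, and the boundary-check variant is not needed because 4096 is a multiple of 128.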
@@ -4214,20 +4636,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
 }


-static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
-static size_t g_scratch_offset = 0;
-
-static int g_device_count = -1;
-static int g_main_device = 0;
-static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
-static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q = false;
-
-static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
-
-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
 void ggml_init_cublas() {
     static bool initialized = false;

@@ -4583,6 +4991,37 @@ inline void ggml_cuda_op_mul_mat_q(
     (void) i1;
 }

+static int64_t get_row_rounding(ggml_type type) {
+    int max_compute_capability = INT_MIN;
+    for (int id = 0; id < g_device_count; ++id) {
+        if (max_compute_capability < g_compute_capabilities[id]
+            && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            max_compute_capability = g_compute_capabilities[id];
+        }
+    }
+
+    switch(type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+            return max_compute_capability >= CC_TURING ? 128 : 64;
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return 64;
+        case GGML_TYPE_F16:
+            return 1;
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+            return max_compute_capability >= CC_TURING ? 128 : 64;
+        case GGML_TYPE_Q6_K:
+            return 64;
+        default:
+            GGML_ASSERT(false);
+    }
+}
+
 inline void ggml_cuda_op_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
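The get_row_rounding() value introduced here feeds the row-splitting changes in the next two hunks: each device's slice of the weight matrix is snapped down to a multiple of the rounding so that every slice begins on a tile boundary for the chosen kernel. Below is a self-contained sketch of that arithmetic with assumed split fractions and row count; the real code reads g_tensor_split, g_device_count and the tensor type instead.

    // Sketch of the per-device row range computed in the following hunks.
    // All inputs are assumed example values.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int     device_count   = 2;
        const float   tensor_split[] = {0.0f, 0.5f};  // fraction of rows preceding each device
        const int64_t nrows          = 1000;          // rows of the quantized matrix
        const int64_t rounding       = 128;           // e.g. q4_K on a CC_TURING device

        for (int id = 0; id < device_count; ++id) {
            int64_t row_low = id == 0 ? 0 : (int64_t)(nrows*tensor_split[id]);
            row_low -= row_low % rounding;            // snap down to a tile boundary

            int64_t row_high;
            if (id == device_count - 1) {
                row_high = nrows;                     // last device takes the remainder
            } else {
                row_high = (int64_t)(nrows*tensor_split[id + 1]);
                row_high -= row_high % rounding;
            }
            std::printf("device %d: rows [%lld, %lld)\n", id, (long long)row_low, (long long)row_high);
        }
        return 0;
    }

Here the nominal 50/50 boundary at row 500 moves down to row 384, the nearest multiple of 128, so device 0 handles rows [0, 384) and device 1 handles rows [384, 1000).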
@@ -4983,14 +5422,16 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm

         int64_t row_low, row_high;
         if (split) {
+            const int64_t rounding = get_row_rounding(src0->type);
+
             row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
-            row_low -= row_low %
+            row_low -= row_low % rounding;

             if (id == g_device_count - 1) {
                 row_high = nrows0;
             } else {
                 row_high = nrows0*g_tensor_split[id + 1];
-                row_high -= row_high %
+                row_high -= row_high % rounding;
             }
         } else {
             row_low = 0;
@@ -5203,7 +5644,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     if (split && g_device_count > 1) {
         CUDA_CHECK(cudaSetDevice(g_main_device));
         for (int id = 0; id < g_device_count; ++id) {
-            if (id != g_main_device) {
+            if (id != g_main_device && src0_extra->events[id]) {
                 CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
             }
         }
@@ -5347,7 +5788,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     } else {
         int min_compute_capability = INT_MAX;
         for (int id = 0; id < g_device_count; ++id) {
-            if (min_compute_capability > g_compute_capabilities[id]
+            if (min_compute_capability > g_compute_capabilities[id]
+                && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
                 min_compute_capability = g_compute_capabilities[id];
             }
         }
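The extra split-fraction test added above mirrors the one inside get_row_rounding(): a device whose slice of the tensor split is empty receives no rows, so its compute capability should not pull min_compute_capability down for the devices that do own rows. A minimal sketch of that scan with assumed example values (not the real globals):

    // Sketch of the capability scan with the empty-slice check.
    #include <climits>
    #include <cstdio>

    int main() {
        const int   device_count           = 2;
        const int   compute_capabilities[] = {610, 750};    // assumed: an older card plus a Turing card
        const float tensor_split[]         = {0.0f, 0.0f};  // device 0 owns the empty range [0.0, 0.0)

        int min_compute_capability = INT_MAX;
        for (int id = 0; id < device_count; ++id) {
            const float next = id + 1 < device_count ? tensor_split[id + 1] : 1.0f;
            if (min_compute_capability > compute_capabilities[id] && tensor_split[id] < next) {
                min_compute_capability = compute_capabilities[id];
            }
        }
        // Device 0 is skipped (0.0 is not < 0.0), so the result is 750 rather than 610.
        std::printf("min_compute_capability = %d\n", min_compute_capability);
        return 0;
    }

Without the check, the capability reported for an idle device would still influence how the multiplication is dispatched.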
@@ -5468,14 +5910,16 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
             row_low = 0;
             row_high = nrows;
         } else if (backend == GGML_BACKEND_GPU_SPLIT) {
+            const int64_t rounding = get_row_rounding(tensor->type);
+
             row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
-            row_low -= row_low %
+            row_low -= row_low % rounding;

             if (id == g_device_count - 1) {
                 row_high = nrows;
             } else {
                 row_high = nrows*g_tensor_split[id + 1];
-                row_high -= row_high %
+                row_high -= row_high % rounding;
             }
         } else {
             GGML_ASSERT(false);