llama_cpp 0.3.6 → 0.3.8
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +8 -0
- data/ext/llama_cpp/src/ggml-alloc.c +44 -6
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1398 -702
- data/ext/llama_cpp/src/ggml-cuda.h +19 -23
- data/ext/llama_cpp/src/ggml-metal.h +6 -3
- data/ext/llama_cpp/src/ggml-metal.m +112 -146
- data/ext/llama_cpp/src/ggml-metal.metal +471 -498
- data/ext/llama_cpp/src/ggml.c +396 -150
- data/ext/llama_cpp/src/ggml.h +113 -32
- data/ext/llama_cpp/src/llama-util.h +51 -9
- data/ext/llama_cpp/src/llama.cpp +390 -210
- data/ext/llama_cpp/src/llama.h +20 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +1 -0
- metadata +2 -2
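The bulk of this release is the reworked data/ext/llama_cpp/src/ggml-cuda.cu shown below: the mul_mat_q tile kernels become templates over mmq_y/nwarps and new vec_dot_*_q8_1 helpers are added for the k-quants. For orientation only, here is a minimal sketch of the pattern those helpers share — accumulate packed int8 products with the __dp4a intrinsic, then rescale by the per-block scales. This is not code from the gem; the helper name block_scale_dot and its signature are invented for illustration.

// Illustrative sketch only, not part of the diff below.
// Assumes a GPU with __CUDA_ARCH__ >= 610, where the __dp4a intrinsic exists.
template <int n>
static __device__ __forceinline__ float block_scale_dot(
    const int * v, const int * u, const float d_x, const float d_y) {
#if __CUDA_ARCH__ >= 610
    int sumi = 0;
#pragma unroll
    for (int i = 0; i < n; ++i) {
        // __dp4a multiplies four packed int8 pairs and accumulates into sumi
        sumi = __dp4a(v[i], u[i], sumi);
    }
    // the integer dot product is rescaled by the two per-block scales
    return d_x * d_y * sumi;
#else
    return 0.0f; // __dp4a is unavailable on older architectures
#endif
}

The real kernels in the diff extend this pattern with a second accumulator for the per-block minimum of the asymmetric quantization formats (q4_1, q5_1, q8_1, and the k-quants).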
@@ -14,6 +14,7 @@
|
|
14
14
|
#include "ggml.h"
|
15
15
|
|
16
16
|
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
17
|
+
#define CC_TURING 700
|
17
18
|
|
18
19
|
#if defined(_MSC_VER)
|
19
20
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
@@ -262,10 +263,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
|
262
263
|
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
263
264
|
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
264
265
|
|
265
|
-
#ifndef GGML_CUDA_MMQ_Y
|
266
|
-
#define GGML_CUDA_MMQ_Y 64
|
267
|
-
#endif // GGML_CUDA_MMQ_Y
|
268
|
-
|
269
266
|
// dmmv = dequantize_mul_mat_vec
|
270
267
|
#ifndef GGML_CUDA_DMMV_X
|
271
268
|
#define GGML_CUDA_DMMV_X 32
|
@@ -285,6 +282,20 @@ struct ggml_tensor_extra_gpu {
|
|
285
282
|
cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
|
286
283
|
};
|
287
284
|
|
285
|
+
static int g_device_count = -1;
|
286
|
+
static int g_main_device = 0;
|
287
|
+
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
|
288
|
+
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
289
|
+
static bool g_mul_mat_q = false;
|
290
|
+
|
291
|
+
static void * g_scratch_buffer = nullptr;
|
292
|
+
static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
|
293
|
+
static size_t g_scratch_offset = 0;
|
294
|
+
|
295
|
+
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
296
|
+
|
297
|
+
static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
|
298
|
+
|
288
299
|
static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
|
289
300
|
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
290
301
|
|
@@ -1383,9 +1394,12 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
|
|
1383
1394
|
sumi = __dp4a(vi1, u[2*i+1], sumi);
|
1384
1395
|
}
|
1385
1396
|
|
1397
|
+
const float2 ds8f = __half22float2(ds8);
|
1398
|
+
|
1386
1399
|
// second part effectively subtracts 8 from each quant value
|
1387
|
-
return d4 * (sumi *
|
1400
|
+
return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
|
1388
1401
|
#else
|
1402
|
+
assert(false);
|
1389
1403
|
return 0.0f; // only to satisfy the compiler
|
1390
1404
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1391
1405
|
}
|
@@ -1410,17 +1424,20 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
|
|
1410
1424
|
}
|
1411
1425
|
|
1412
1426
|
#ifdef GGML_CUDA_F16
|
1413
|
-
const
|
1414
|
-
const float d4d8 =
|
1415
|
-
const float m4s8 =
|
1427
|
+
const float2 tmp = __half22float2(__hmul2(dm4, ds8));
|
1428
|
+
const float d4d8 = tmp.x;
|
1429
|
+
const float m4s8 = tmp.y;
|
1416
1430
|
#else
|
1417
|
-
const
|
1418
|
-
const
|
1431
|
+
const float2 dm4f = __half22float2(dm4);
|
1432
|
+
const float2 ds8f = __half22float2(ds8);
|
1433
|
+
const float d4d8 = dm4f.x * ds8f.x;
|
1434
|
+
const float m4s8 = dm4f.y * ds8f.y;
|
1419
1435
|
#endif // GGML_CUDA_F16
|
1420
1436
|
|
1421
1437
|
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
|
1422
1438
|
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
|
1423
1439
|
#else
|
1440
|
+
assert(false);
|
1424
1441
|
return 0.0f; // only to satisfy the compiler
|
1425
1442
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1426
1443
|
}
|
@@ -1434,6 +1451,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
|
|
1434
1451
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1435
1452
|
int sumi = 0;
|
1436
1453
|
|
1454
|
+
#pragma unroll
|
1437
1455
|
for (int i = 0; i < vdr; ++i) {
|
1438
1456
|
int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
|
1439
1457
|
vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
|
@@ -1450,9 +1468,12 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
|
|
1450
1468
|
sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
|
1451
1469
|
}
|
1452
1470
|
|
1471
|
+
const float2 ds8f = __half22float2(ds8);
|
1472
|
+
|
1453
1473
|
// second part effectively subtracts 16 from each quant value
|
1454
|
-
return d5 * (sumi*
|
1474
|
+
return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
|
1455
1475
|
#else
|
1476
|
+
assert(false);
|
1456
1477
|
return 0.0f; // only to satisfy the compiler
|
1457
1478
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1458
1479
|
}
|
@@ -1466,6 +1487,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
|
|
1466
1487
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1467
1488
|
int sumi = 0;
|
1468
1489
|
|
1490
|
+
#pragma unroll
|
1469
1491
|
for (int i = 0; i < vdr; ++i) {
|
1470
1492
|
int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
|
1471
1493
|
vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
|
@@ -1483,18 +1505,21 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
|
|
1483
1505
|
}
|
1484
1506
|
|
1485
1507
|
#ifdef GGML_CUDA_F16
|
1486
|
-
const
|
1487
|
-
const float d5d8 =
|
1488
|
-
const float m5s8 =
|
1508
|
+
const float2 tmp = __half22float2(__hmul2(dm5, ds8));
|
1509
|
+
const float d5d8 = tmp.x;
|
1510
|
+
const float m5s8 = tmp.y;
|
1489
1511
|
#else
|
1490
|
-
const
|
1491
|
-
const
|
1512
|
+
const float2 dm5f = __half22float2(dm5);
|
1513
|
+
const float2 ds8f = __half22float2(ds8);
|
1514
|
+
const float d5d8 = dm5f.x * ds8f.x;
|
1515
|
+
const float m5s8 = dm5f.y * ds8f.y;
|
1492
1516
|
#endif // GGML_CUDA_F16
|
1493
1517
|
|
1494
1518
|
// scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
|
1495
1519
|
return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
|
1496
1520
|
|
1497
1521
|
#else
|
1522
|
+
assert(false);
|
1498
1523
|
return 0.0f; // only to satisfy the compiler
|
1499
1524
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1500
1525
|
}
|
@@ -1503,18 +1528,20 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
|
|
1503
1528
|
#define VDR_Q8_0_Q8_1_MMQ 8
|
1504
1529
|
|
1505
1530
|
template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
|
1506
|
-
const int * v, const int * u, const float & d8_0, const
|
1531
|
+
const int * v, const int * u, const float & d8_0, const float & d8_1) {
|
1507
1532
|
|
1508
1533
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1509
1534
|
int sumi = 0;
|
1510
1535
|
|
1536
|
+
#pragma unroll
|
1511
1537
|
for (int i = 0; i < vdr; ++i) {
|
1512
1538
|
// SIMD dot product of quantized values
|
1513
1539
|
sumi = __dp4a(v[i], u[i], sumi);
|
1514
1540
|
}
|
1515
1541
|
|
1516
|
-
return
|
1542
|
+
return d8_0*d8_1 * sumi;
|
1517
1543
|
#else
|
1544
|
+
assert(false);
|
1518
1545
|
return 0.0f; // only to satisfy the compiler
|
1519
1546
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1520
1547
|
}
|
@@ -1525,23 +1552,374 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
|
|
1525
1552
|
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1526
1553
|
int sumi = 0;
|
1527
1554
|
|
1555
|
+
#pragma unroll
|
1528
1556
|
for (int i = 0; i < vdr; ++i) {
|
1529
1557
|
// SIMD dot product of quantized values
|
1530
1558
|
sumi = __dp4a(v[i], u[i], sumi);
|
1531
1559
|
}
|
1532
1560
|
|
1533
1561
|
#ifdef GGML_CUDA_F16
|
1534
|
-
const
|
1535
|
-
const float d8d8 =
|
1536
|
-
const float m8s8 =
|
1562
|
+
const float2 tmp = __half22float2(__hmul2(dm8, ds8));
|
1563
|
+
const float d8d8 = tmp.x;
|
1564
|
+
const float m8s8 = tmp.y;
|
1537
1565
|
#else
|
1538
|
-
const
|
1539
|
-
const
|
1566
|
+
const float2 dm8f = __half22float2(dm8);
|
1567
|
+
const float2 ds8f = __half22float2(ds8);
|
1568
|
+
const float d8d8 = dm8f.x * ds8f.x;
|
1569
|
+
const float m8s8 = dm8f.y * ds8f.y;
|
1540
1570
|
#endif // GGML_CUDA_F16
|
1541
1571
|
|
1542
1572
|
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
|
1543
1573
|
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
|
1544
1574
|
#else
|
1575
|
+
assert(false);
|
1576
|
+
return 0.0f; // only to satisfy the compiler
|
1577
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1578
|
+
}
|
1579
|
+
|
1580
|
+
#define VDR_Q2_K_Q8_1_MMVQ 1
|
1581
|
+
#define VDR_Q2_K_Q8_1_MMQ 2
|
1582
|
+
|
1583
|
+
// contiguous v/x values
|
1584
|
+
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
|
1585
|
+
const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
1586
|
+
const half2 & dm2, const float * __restrict__ d8) {
|
1587
|
+
|
1588
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1589
|
+
float sumf_d = 0.0f;
|
1590
|
+
float sumf_m = 0.0f;
|
1591
|
+
|
1592
|
+
#pragma unroll
|
1593
|
+
for (int i = 0; i < QR2_K; ++i) {
|
1594
|
+
const int sc = scales[2*i];
|
1595
|
+
|
1596
|
+
const int vi = (v >> (2*i)) & 0x03030303;
|
1597
|
+
|
1598
|
+
sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
|
1599
|
+
|
1600
|
+
// fill int with 4x m
|
1601
|
+
int m = sc >> 4;
|
1602
|
+
m |= m << 8;
|
1603
|
+
m |= m << 16;
|
1604
|
+
sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
|
1605
|
+
}
|
1606
|
+
|
1607
|
+
const float2 dm2f = __half22float2(dm2);
|
1608
|
+
|
1609
|
+
return dm2f.x*sumf_d - dm2f.y*sumf_m;
|
1610
|
+
#else
|
1611
|
+
assert(false);
|
1612
|
+
return 0.0f; // only to satisfy the compiler
|
1613
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1614
|
+
}
|
1615
|
+
|
1616
|
+
// contiguous u/y values
|
1617
|
+
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
|
1618
|
+
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
1619
|
+
const half2 & dm2, const float & d8) {
|
1620
|
+
|
1621
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1622
|
+
int sumi_d = 0;
|
1623
|
+
int sumi_m = 0;
|
1624
|
+
|
1625
|
+
#pragma unroll
|
1626
|
+
for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
|
1627
|
+
int sumi_d_sc = 0;
|
1628
|
+
|
1629
|
+
const int sc = scales[i0 / (QI8_1/2)];
|
1630
|
+
|
1631
|
+
// fill int with 4x m
|
1632
|
+
int m = sc >> 4;
|
1633
|
+
m |= m << 8;
|
1634
|
+
m |= m << 16;
|
1635
|
+
|
1636
|
+
#pragma unroll
|
1637
|
+
for (int i = i0; i < i0 + QI8_1/2; ++i) {
|
1638
|
+
sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
|
1639
|
+
sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m
|
1640
|
+
}
|
1641
|
+
|
1642
|
+
sumi_d += sumi_d_sc * (sc & 0xF);
|
1643
|
+
}
|
1644
|
+
|
1645
|
+
const float2 dm2f = __half22float2(dm2);
|
1646
|
+
|
1647
|
+
return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
|
1648
|
+
#else
|
1649
|
+
assert(false);
|
1650
|
+
return 0.0f; // only to satisfy the compiler
|
1651
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1652
|
+
}
|
1653
|
+
|
1654
|
+
#define VDR_Q3_K_Q8_1_MMVQ 1
|
1655
|
+
#define VDR_Q3_K_Q8_1_MMQ 2
|
1656
|
+
|
1657
|
+
// contiguous v/x values
|
1658
|
+
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
|
1659
|
+
const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
1660
|
+
const int & scale_offset, const float & d3, const float * __restrict__ d8) {
|
1661
|
+
|
1662
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1663
|
+
float sumf = 0.0f;
|
1664
|
+
|
1665
|
+
#pragma unroll
|
1666
|
+
for (int i = 0; i < QR3_K; ++i) {
|
1667
|
+
const int isc = scale_offset + 2*i;
|
1668
|
+
|
1669
|
+
const int isc_low = isc % (QK_K/32);
|
1670
|
+
const int sc_shift_low = 4 * (isc / (QK_K/32));
|
1671
|
+
const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
|
1672
|
+
|
1673
|
+
const int isc_high = isc % (QK_K/64);
|
1674
|
+
const int sc_shift_high = 2 * (isc / (QK_K/64));
|
1675
|
+
const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
|
1676
|
+
|
1677
|
+
const int sc = (sc_low | sc_high) - 32;
|
1678
|
+
|
1679
|
+
const int vil = (vl >> (2*i)) & 0x03030303;
|
1680
|
+
|
1681
|
+
const int vih = ((vh >> i) << 2) & 0x04040404;
|
1682
|
+
|
1683
|
+
const int vi = __vsubss4(vil, vih);
|
1684
|
+
|
1685
|
+
sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
|
1686
|
+
}
|
1687
|
+
|
1688
|
+
return d3 * sumf;
|
1689
|
+
#else
|
1690
|
+
assert(false);
|
1691
|
+
return 0.0f; // only to satisfy the compiler
|
1692
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1693
|
+
}
|
1694
|
+
|
1695
|
+
// contiguous u/y values
|
1696
|
+
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
|
1697
|
+
const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
|
1698
|
+
const float & d3, const float & d8) {
|
1699
|
+
|
1700
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1701
|
+
int sumi = 0;
|
1702
|
+
|
1703
|
+
#pragma unroll
|
1704
|
+
for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
|
1705
|
+
int sumi_sc = 0;
|
1706
|
+
|
1707
|
+
for (int i = i0; i < i0 + QI8_1/2; ++i) {
|
1708
|
+
sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
|
1709
|
+
}
|
1710
|
+
|
1711
|
+
sumi += sumi_sc * scales[i0 / (QI8_1/2)];
|
1712
|
+
}
|
1713
|
+
|
1714
|
+
return d3*d8 * sumi;
|
1715
|
+
#else
|
1716
|
+
assert(false);
|
1717
|
+
return 0.0f; // only to satisfy the compiler
|
1718
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1719
|
+
}
|
1720
|
+
|
1721
|
+
#define VDR_Q4_K_Q8_1_MMVQ 2
|
1722
|
+
#define VDR_Q4_K_Q8_1_MMQ 8
|
1723
|
+
|
1724
|
+
// contiguous v/x values
|
1725
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
|
1726
|
+
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
1727
|
+
const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
|
1728
|
+
|
1729
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1730
|
+
float sumf_d = 0.0f;
|
1731
|
+
float sumf_m = 0.0f;
|
1732
|
+
|
1733
|
+
#pragma unroll
|
1734
|
+
for (int i = 0; i < QR4_K; ++i) {
|
1735
|
+
const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
|
1736
|
+
const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
|
1737
|
+
|
1738
|
+
const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
|
1739
|
+
const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
|
1740
|
+
|
1741
|
+
sumf_d += d8[i] * (dot1 * sc[i]);
|
1742
|
+
sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
|
1743
|
+
}
|
1744
|
+
|
1745
|
+
const float2 dm4f = __half22float2(dm4);
|
1746
|
+
|
1747
|
+
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
1748
|
+
|
1749
|
+
#else
|
1750
|
+
assert(false);
|
1751
|
+
return 0.0f; // only to satisfy the compiler
|
1752
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1753
|
+
}
|
1754
|
+
|
1755
|
+
// contiguous u/y values
|
1756
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
|
1757
|
+
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
1758
|
+
const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
|
1759
|
+
|
1760
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1761
|
+
float sumf_d = 0.0f;
|
1762
|
+
float sumf_m = 0.0f;
|
1763
|
+
|
1764
|
+
#pragma unroll
|
1765
|
+
for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
|
1766
|
+
int sumi_d = 0;
|
1767
|
+
|
1768
|
+
#pragma unroll
|
1769
|
+
for (int j = 0; j < QI8_1; ++j) {
|
1770
|
+
sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
|
1771
|
+
}
|
1772
|
+
|
1773
|
+
const float2 ds8f = __half22float2(ds8[i]);
|
1774
|
+
|
1775
|
+
sumf_d += ds8f.x * (sc[i] * sumi_d);
|
1776
|
+
sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
|
1777
|
+
}
|
1778
|
+
|
1779
|
+
const float2 dm4f = __half22float2(dm4);
|
1780
|
+
|
1781
|
+
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
1782
|
+
|
1783
|
+
#else
|
1784
|
+
assert(false);
|
1785
|
+
return 0.0f; // only to satisfy the compiler
|
1786
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1787
|
+
}
|
1788
|
+
|
1789
|
+
#define VDR_Q5_K_Q8_1_MMVQ 2
|
1790
|
+
#define VDR_Q5_K_Q8_1_MMQ 8
|
1791
|
+
|
1792
|
+
// contiguous v/x values
|
1793
|
+
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
|
1794
|
+
const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
1795
|
+
const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
|
1796
|
+
|
1797
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1798
|
+
float sumf_d = 0.0f;
|
1799
|
+
float sumf_m = 0.0f;
|
1800
|
+
|
1801
|
+
#pragma unroll
|
1802
|
+
for (int i = 0; i < QR5_K; ++i) {
|
1803
|
+
const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
|
1804
|
+
const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
|
1805
|
+
|
1806
|
+
const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
|
1807
|
+
const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
|
1808
|
+
|
1809
|
+
const int v0i = vl0i | vh0i;
|
1810
|
+
const int v1i = vl1i | vh1i;
|
1811
|
+
|
1812
|
+
const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
|
1813
|
+
const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
|
1814
|
+
|
1815
|
+
sumf_d += d8[i] * (dot1 * sc[i]);
|
1816
|
+
sumf_m += d8[i] * (dot2 * m[i]);
|
1817
|
+
|
1818
|
+
}
|
1819
|
+
|
1820
|
+
const float2 dm5f = __half22float2(dm5);
|
1821
|
+
|
1822
|
+
return dm5f.x*sumf_d - dm5f.y*sumf_m;
|
1823
|
+
|
1824
|
+
#else
|
1825
|
+
assert(false);
|
1826
|
+
return 0.0f; // only to satisfy the compiler
|
1827
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1828
|
+
}
|
1829
|
+
|
1830
|
+
// contiguous u/y values
|
1831
|
+
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
|
1832
|
+
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
1833
|
+
const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
|
1834
|
+
|
1835
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1836
|
+
float sumf_d = 0.0f;
|
1837
|
+
float sumf_m = 0.0f;
|
1838
|
+
|
1839
|
+
#pragma unroll
|
1840
|
+
for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
|
1841
|
+
int sumi_d = 0;
|
1842
|
+
|
1843
|
+
#pragma unroll
|
1844
|
+
for (int j = 0; j < QI8_1; ++j) {
|
1845
|
+
sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
|
1846
|
+
}
|
1847
|
+
|
1848
|
+
const float2 ds8f = __half22float2(ds8[i]);
|
1849
|
+
|
1850
|
+
sumf_d += ds8f.x * (sc[i] * sumi_d);
|
1851
|
+
sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
|
1852
|
+
}
|
1853
|
+
|
1854
|
+
const float2 dm4f = __half22float2(dm4);
|
1855
|
+
|
1856
|
+
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
1857
|
+
|
1858
|
+
#else
|
1859
|
+
assert(false);
|
1860
|
+
return 0.0f; // only to satisfy the compiler
|
1861
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1862
|
+
}
|
1863
|
+
|
1864
|
+
#define VDR_Q6_K_Q8_1_MMVQ 1
|
1865
|
+
#define VDR_Q6_K_Q8_1_MMQ 8
|
1866
|
+
|
1867
|
+
// contiguous v/x values
|
1868
|
+
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
|
1869
|
+
const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
|
1870
|
+
const float & d, const float * __restrict__ d8) {
|
1871
|
+
|
1872
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1873
|
+
float sumf = 0.0f;
|
1874
|
+
|
1875
|
+
#pragma unroll
|
1876
|
+
for (int i = 0; i < QR6_K; ++i) {
|
1877
|
+
const int sc = scales[4*i];
|
1878
|
+
|
1879
|
+
const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
|
1880
|
+
|
1881
|
+
const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
|
1882
|
+
|
1883
|
+
const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
|
1884
|
+
|
1885
|
+
sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
|
1886
|
+
}
|
1887
|
+
|
1888
|
+
return d*sumf;
|
1889
|
+
#else
|
1890
|
+
assert(false);
|
1891
|
+
return 0.0f; // only to satisfy the compiler
|
1892
|
+
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1893
|
+
}
|
1894
|
+
|
1895
|
+
// contiguous u/y values
|
1896
|
+
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
|
1897
|
+
const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
|
1898
|
+
const float & d6, const float * __restrict__ d8) {
|
1899
|
+
|
1900
|
+
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
1901
|
+
float sumf_d = 0.0f;
|
1902
|
+
|
1903
|
+
#pragma unroll
|
1904
|
+
for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
|
1905
|
+
int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
|
1906
|
+
|
1907
|
+
#pragma unroll
|
1908
|
+
for (int i = i0; i < i0 + 2; ++i) {
|
1909
|
+
sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
|
1910
|
+
sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
|
1911
|
+
|
1912
|
+
sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
|
1913
|
+
sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
|
1914
|
+
}
|
1915
|
+
|
1916
|
+
sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
|
1917
|
+
}
|
1918
|
+
|
1919
|
+
return d6 * sumf_d;
|
1920
|
+
|
1921
|
+
#else
|
1922
|
+
assert(false);
|
1545
1923
|
return 0.0f; // only to satisfy the compiler
|
1546
1924
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
1547
1925
|
}
|
@@ -1564,21 +1942,21 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
|
|
1564
1942
|
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
|
1565
1943
|
}
|
1566
1944
|
|
1567
|
-
static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1945
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1568
1946
|
|
1569
|
-
__shared__ int tile_x_qs[
|
1570
|
-
__shared__ float tile_x_d[
|
1947
|
+
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
|
1948
|
+
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
|
1571
1949
|
|
1572
1950
|
*x_ql = tile_x_qs;
|
1573
1951
|
*x_dm = (half2 *) tile_x_d;
|
1574
1952
|
}
|
1575
1953
|
|
1576
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
|
1954
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
|
1577
1955
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1578
1956
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1579
1957
|
|
1580
1958
|
__builtin_assume(i_offset >= 0);
|
1581
|
-
__builtin_assume(i_offset <
|
1959
|
+
__builtin_assume(i_offset < nwarps);
|
1582
1960
|
__builtin_assume(k >= 0);
|
1583
1961
|
__builtin_assume(k < WARP_SIZE);
|
1584
1962
|
|
@@ -1590,7 +1968,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
1590
1968
|
float * x_dmf = (float *) x_dm;
|
1591
1969
|
|
1592
1970
|
#pragma unroll
|
1593
|
-
for (int i0 = 0; i0 <
|
1971
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
1594
1972
|
int i = i0 + i_offset;
|
1595
1973
|
|
1596
1974
|
if (need_check) {
|
@@ -1600,38 +1978,30 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
1600
1978
|
const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
1601
1979
|
|
1602
1980
|
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
|
1603
|
-
x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
|
1981
|
+
// x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
|
1604
1982
|
}
|
1605
1983
|
|
1606
|
-
|
1607
|
-
|
1984
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
|
1985
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
1608
1986
|
|
1609
|
-
|
1610
|
-
|
1611
|
-
|
1612
|
-
// const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
|
1987
|
+
#pragma unroll
|
1988
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
|
1989
|
+
int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
|
1613
1990
|
|
1614
|
-
|
1615
|
-
|
1616
|
-
|
1991
|
+
if (need_check) {
|
1992
|
+
i = min(i, i_max);
|
1993
|
+
}
|
1617
1994
|
|
1618
|
-
|
1995
|
+
const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
1619
1996
|
|
1620
|
-
|
1621
|
-
|
1997
|
+
x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
|
1998
|
+
}
|
1622
1999
|
}
|
1623
2000
|
|
1624
2001
|
static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
|
1625
2002
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1626
2003
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1627
2004
|
|
1628
|
-
__builtin_assume(i >= 0);
|
1629
|
-
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1630
|
-
__builtin_assume(j >= 0);
|
1631
|
-
__builtin_assume(j < WARP_SIZE);
|
1632
|
-
__builtin_assume(k >= 0);
|
1633
|
-
__builtin_assume(k < WARP_SIZE);
|
1634
|
-
|
1635
2005
|
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1636
2006
|
const float * x_dmf = (float *) x_dm;
|
1637
2007
|
|
@@ -1639,13 +2009,13 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
|
|
1639
2009
|
|
1640
2010
|
#pragma unroll
|
1641
2011
|
for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
|
1642
|
-
u[2*l+0] = y_qs[j *
|
1643
|
-
u[2*l+1] = y_qs[j *
|
2012
|
+
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
|
2013
|
+
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
|
1644
2014
|
}
|
1645
2015
|
|
1646
2016
|
return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
|
1647
2017
|
(&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
|
1648
|
-
y_ds[j * (
|
2018
|
+
y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
|
1649
2019
|
}
|
1650
2020
|
|
1651
2021
|
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
|
@@ -1666,21 +2036,21 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
|
|
1666
2036
|
return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
|
1667
2037
|
}
|
1668
2038
|
|
1669
|
-
static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2039
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1670
2040
|
|
1671
|
-
__shared__ int tile_x_qs[
|
1672
|
-
__shared__ half2 tile_x_dm[
|
2041
|
+
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
|
2042
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
|
1673
2043
|
|
1674
2044
|
*x_ql = tile_x_qs;
|
1675
2045
|
*x_dm = tile_x_dm;
|
1676
2046
|
}
|
1677
2047
|
|
1678
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
|
2048
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
|
1679
2049
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1680
2050
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1681
2051
|
|
1682
2052
|
__builtin_assume(i_offset >= 0);
|
1683
|
-
__builtin_assume(i_offset <
|
2053
|
+
__builtin_assume(i_offset < nwarps);
|
1684
2054
|
__builtin_assume(k >= 0);
|
1685
2055
|
__builtin_assume(k < WARP_SIZE);
|
1686
2056
|
|
@@ -1690,7 +2060,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
1690
2060
|
const block_q4_1 * bx0 = (block_q4_1 *) vx;
|
1691
2061
|
|
1692
2062
|
#pragma unroll
|
1693
|
-
for (int i0 = 0; i0 <
|
2063
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
1694
2064
|
int i = i0 + i_offset;
|
1695
2065
|
|
1696
2066
|
if (need_check) {
|
@@ -1706,7 +2076,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
1706
2076
|
const int kbxd = k % blocks_per_tile_x_row;
|
1707
2077
|
|
1708
2078
|
#pragma unroll
|
1709
|
-
for (int i0 = 0; i0 <
|
2079
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
|
1710
2080
|
int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
|
1711
2081
|
|
1712
2082
|
if (need_check) {
|
@@ -1723,26 +2093,19 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
|
|
1723
2093
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1724
2094
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1725
2095
|
|
1726
|
-
__builtin_assume(i >= 0);
|
1727
|
-
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1728
|
-
__builtin_assume(j >= 0);
|
1729
|
-
__builtin_assume(j < WARP_SIZE);
|
1730
|
-
__builtin_assume(k >= 0);
|
1731
|
-
__builtin_assume(k < WARP_SIZE);
|
1732
|
-
|
1733
2096
|
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1734
2097
|
|
1735
2098
|
int u[2*VDR_Q4_1_Q8_1_MMQ];
|
1736
2099
|
|
1737
2100
|
#pragma unroll
|
1738
2101
|
for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
|
1739
|
-
u[2*l+0] = y_qs[j *
|
1740
|
-
u[2*l+1] = y_qs[j *
|
2102
|
+
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
|
2103
|
+
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
|
1741
2104
|
}
|
1742
2105
|
|
1743
2106
|
return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
|
1744
2107
|
(&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
|
1745
|
-
y_ds[j * (
|
2108
|
+
y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
|
1746
2109
|
}
|
1747
2110
|
|
1748
2111
|
static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
|
@@ -1765,21 +2128,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
|
|
1765
2128
|
return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
|
1766
2129
|
}
|
1767
2130
|
|
1768
|
-
static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2131
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1769
2132
|
|
1770
|
-
__shared__ int tile_x_ql[
|
1771
|
-
__shared__ float tile_x_d[
|
2133
|
+
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
2134
|
+
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
|
1772
2135
|
|
1773
2136
|
*x_ql = tile_x_ql;
|
1774
2137
|
*x_dm = (half2 *) tile_x_d;
|
1775
2138
|
}
|
1776
2139
|
|
1777
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
|
2140
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
|
1778
2141
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1779
2142
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1780
2143
|
|
1781
2144
|
__builtin_assume(i_offset >= 0);
|
1782
|
-
__builtin_assume(i_offset <
|
2145
|
+
__builtin_assume(i_offset < nwarps);
|
1783
2146
|
__builtin_assume(k >= 0);
|
1784
2147
|
__builtin_assume(k < WARP_SIZE);
|
1785
2148
|
|
@@ -1789,7 +2152,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
|
|
1789
2152
|
const block_q5_0 * bx0 = (block_q5_0 *) vx;
|
1790
2153
|
|
1791
2154
|
#pragma unroll
|
1792
|
-
for (int i0 = 0; i0 <
|
2155
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
1793
2156
|
int i = i0 + i_offset;
|
1794
2157
|
|
1795
2158
|
if (need_check) {
|
@@ -1825,7 +2188,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
|
|
1825
2188
|
float * x_dmf = (float *) x_dm;
|
1826
2189
|
|
1827
2190
|
#pragma unroll
|
1828
|
-
for (int i0 = 0; i0 <
|
2191
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
|
1829
2192
|
int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
|
1830
2193
|
|
1831
2194
|
if (need_check) {
|
@@ -1842,27 +2205,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
|
|
1842
2205
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1843
2206
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1844
2207
|
|
1845
|
-
__builtin_assume(i >= 0);
|
1846
|
-
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1847
|
-
__builtin_assume(j >= 0);
|
1848
|
-
__builtin_assume(j < WARP_SIZE);
|
1849
|
-
__builtin_assume(k >= 0);
|
1850
|
-
__builtin_assume(k < WARP_SIZE);
|
1851
|
-
|
1852
2208
|
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1853
2209
|
const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
|
1854
|
-
const float * x_dmf = (float *) x_dm;
|
2210
|
+
const float * x_dmf = (const float *) x_dm;
|
2211
|
+
const float * y_df = (const float *) y_ds;
|
1855
2212
|
|
1856
2213
|
int u[2*VDR_Q5_0_Q8_1_MMQ];
|
1857
2214
|
|
1858
2215
|
#pragma unroll
|
1859
2216
|
for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
|
1860
|
-
u[2*l+0] = y_qs[j *
|
1861
|
-
u[2*l+1] = y_qs[j *
|
2217
|
+
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
|
2218
|
+
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
|
1862
2219
|
}
|
1863
2220
|
|
1864
2221
|
return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
|
1865
|
-
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx],
|
2222
|
+
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
|
1866
2223
|
}
|
1867
2224
|
|
1868
2225
|
static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
|
@@ -1885,21 +2242,21 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
|
|
1885
2242
|
return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
|
1886
2243
|
}
|
1887
2244
|
|
1888
|
-
static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2245
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
1889
2246
|
|
1890
|
-
__shared__ int tile_x_ql[
|
1891
|
-
__shared__ half2 tile_x_dm[
|
2247
|
+
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
2248
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
|
1892
2249
|
|
1893
2250
|
*x_ql = tile_x_ql;
|
1894
2251
|
*x_dm = tile_x_dm;
|
1895
2252
|
}
|
1896
2253
|
|
1897
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
|
2254
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
|
1898
2255
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
1899
2256
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
1900
2257
|
|
1901
2258
|
__builtin_assume(i_offset >= 0);
|
1902
|
-
__builtin_assume(i_offset <
|
2259
|
+
__builtin_assume(i_offset < nwarps);
|
1903
2260
|
__builtin_assume(k >= 0);
|
1904
2261
|
__builtin_assume(k < WARP_SIZE);
|
1905
2262
|
|
@@ -1909,7 +2266,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
|
|
1909
2266
|
const block_q5_1 * bx0 = (block_q5_1 *) vx;
|
1910
2267
|
|
1911
2268
|
#pragma unroll
|
1912
|
-
for (int i0 = 0; i0 <
|
2269
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
1913
2270
|
int i = i0 + i_offset;
|
1914
2271
|
|
1915
2272
|
if (need_check) {
|
@@ -1942,7 +2299,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
|
|
1942
2299
|
const int kbxd = k % blocks_per_tile_x_row;
|
1943
2300
|
|
1944
2301
|
#pragma unroll
|
1945
|
-
for (int i0 = 0; i0 <
|
2302
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
|
1946
2303
|
int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
|
1947
2304
|
|
1948
2305
|
if (need_check) {
|
@@ -1959,13 +2316,6 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
|
|
1959
2316
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
1960
2317
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
1961
2318
|
|
1962
|
-
__builtin_assume(i >= 0);
|
1963
|
-
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
1964
|
-
__builtin_assume(j >= 0);
|
1965
|
-
__builtin_assume(j < WARP_SIZE);
|
1966
|
-
__builtin_assume(k >= 0);
|
1967
|
-
__builtin_assume(k < WARP_SIZE);
|
1968
|
-
|
1969
2319
|
const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
|
1970
2320
|
const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
|
1971
2321
|
|
@@ -1973,12 +2323,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
|
|
1973
2323
|
|
1974
2324
|
#pragma unroll
|
1975
2325
|
for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
|
1976
|
-
u[2*l+0] = y_qs[j *
|
1977
|
-
u[2*l+1] = y_qs[j *
|
2326
|
+
u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
|
2327
|
+
u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
|
1978
2328
|
}
|
1979
2329
|
|
1980
2330
|
return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
|
1981
|
-
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (
|
2331
|
+
(&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
|
1982
2332
|
}
|
1983
2333
|
|
1984
2334
|
static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
|
@@ -1989,29 +2339,30 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
|
|
1989
2339
|
int v[VDR_Q8_0_Q8_1_MMVQ];
|
1990
2340
|
int u[VDR_Q8_0_Q8_1_MMVQ];
|
1991
2341
|
|
2342
|
+
#pragma unroll
|
1992
2343
|
for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
|
1993
2344
|
v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
|
1994
2345
|
u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
1995
2346
|
}
|
1996
2347
|
|
1997
|
-
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds);
|
2348
|
+
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
|
1998
2349
|
}
|
1999
2350
|
|
2000
|
-
static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2351
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2001
2352
|
|
2002
|
-
__shared__ int tile_x_qs[
|
2003
|
-
__shared__ float tile_x_d[
|
2353
|
+
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
|
2354
|
+
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
|
2004
2355
|
|
2005
2356
|
*x_ql = tile_x_qs;
|
2006
2357
|
*x_dm = (half2 *) tile_x_d;
|
2007
2358
|
}
|
2008
2359
|
|
2009
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
|
2360
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
|
2010
2361
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2011
2362
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2012
2363
|
|
2013
2364
|
__builtin_assume(i_offset >= 0);
|
2014
|
-
__builtin_assume(i_offset <
|
2365
|
+
__builtin_assume(i_offset < nwarps);
|
2015
2366
|
__builtin_assume(k >= 0);
|
2016
2367
|
__builtin_assume(k < WARP_SIZE);
|
2017
2368
|
|
@@ -2022,7 +2373,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
|
|
2022
2373
|
const block_q8_0 * bx0 = (block_q8_0 *) vx;
|
2023
2374
|
|
2024
2375
|
#pragma unroll
|
2025
|
-
for (int i0 = 0; i0 <
|
2376
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2026
2377
|
int i = i0 + i_offset;
|
2027
2378
|
|
2028
2379
|
if (need_check) {
|
@@ -2032,76 +2383,35 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
|
|
2032
2383
|
const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
|
2033
2384
|
|
2034
2385
|
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
|
2035
|
-
x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
|
2036
2386
|
}
|
2037
2387
|
|
2038
|
-
|
2039
|
-
|
2388
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
|
2389
|
+
const int kbxd = k % blocks_per_tile_x_row;
|
2040
2390
|
|
2041
|
-
|
2042
|
-
|
2043
|
-
|
2044
|
-
// const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
|
2391
|
+
#pragma unroll
|
2392
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
|
2393
|
+
int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
|
2045
2394
|
|
2046
|
-
|
2047
|
-
|
2048
|
-
|
2049
|
-
// }
|
2050
|
-
// #endif // GGML_CUDA_MMQ_Y < 64
|
2395
|
+
if (need_check) {
|
2396
|
+
i = min(i, i_max);
|
2397
|
+
}
|
2051
2398
|
|
2052
|
-
|
2399
|
+
const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
|
2053
2400
|
|
2054
|
-
|
2055
|
-
|
2401
|
+
x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
|
2402
|
+
}
|
2056
2403
|
}
|
2057
2404
|
|
2058
2405
|
static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
|
2059
2406
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2060
2407
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2061
2408
|
|
2062
|
-
|
2063
|
-
|
2064
|
-
__builtin_assume(j >= 0);
|
2065
|
-
__builtin_assume(j < WARP_SIZE);
|
2066
|
-
__builtin_assume(k >= 0);
|
2067
|
-
__builtin_assume(k < WARP_SIZE);
|
2068
|
-
|
2069
|
-
const float * x_dmf = (float *) x_dm;
|
2409
|
+
const float * x_dmf = (const float *) x_dm;
|
2410
|
+
const float * y_df = (const float *) y_ds;
|
2070
2411
|
|
2071
2412
|
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
|
2072
2413
|
(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
|
2073
|
-
|
2074
|
-
}
|
2075
|
-
|
2076
|
-
#define VDR_q2_K_q8_1 1
|
2077
|
-
|
2078
|
-
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl(
|
2079
|
-
const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
2080
|
-
const half2 & dm, const float * __restrict__ d8) {
|
2081
|
-
|
2082
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2083
|
-
float sumf_d = 0.0f;
|
2084
|
-
float sumf_m = 0.0f;
|
2085
|
-
|
2086
|
-
for (int i = 0; i < QR2_K; ++i) {
|
2087
|
-
const int sc = scales[2*i];
|
2088
|
-
|
2089
|
-
const int vi = (v >> (2*i)) & 0x03030303;
|
2090
|
-
|
2091
|
-
sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
|
2092
|
-
|
2093
|
-
int sc_high = sc >> 4;
|
2094
|
-
sc_high |= sc_high << 8;
|
2095
|
-
sc_high |= sc_high << 16;
|
2096
|
-
sumf_m += d8[i] * __dp4a(sc_high, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
|
2097
|
-
}
|
2098
|
-
|
2099
|
-
const float2 dmf = __half22float2(dm);
|
2100
|
-
|
2101
|
-
return dmf.x*sumf_d - dmf.y*sumf_m;
|
2102
|
-
#else
|
2103
|
-
return 0.0f; // only to satisfy the compiler
|
2104
|
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2414
|
+
y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
|
2105
2415
|
}
|
2106
2416
|
|
2107
2417
|
static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
|
@@ -2115,34 +2425,35 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
|
|
2115
2425
|
const uint8_t * scales = bq2_K->scales + scale_offset;
|
2116
2426
|
|
2117
2427
|
const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
|
2118
|
-
int
|
2428
|
+
int u[QR2_K];
|
2119
2429
|
float d8[QR2_K];
|
2120
2430
|
|
2431
|
+
#pragma unroll
|
2121
2432
|
for (int i = 0; i < QR2_K; ++ i) {
|
2122
2433
|
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
2123
2434
|
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
2124
2435
|
}
|
2125
2436
|
|
2126
|
-
return
|
2437
|
+
return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
|
2127
2438
|
}
|
2128
2439
|
|
2129
|
-
static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2440
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2130
2441
|
|
2131
|
-
__shared__ int tile_x_ql[
|
2132
|
-
__shared__ half2 tile_x_dm[
|
2133
|
-
__shared__ int tile_x_sc[
|
2442
|
+
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
|
2443
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
|
2444
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
|
2134
2445
|
|
2135
2446
|
*x_ql = tile_x_ql;
|
2136
2447
|
*x_dm = tile_x_dm;
|
2137
2448
|
*x_sc = tile_x_sc;
|
2138
2449
|
}
|
2139
2450
|
|
2140
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
|
2451
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
|
2141
2452
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2142
2453
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2143
2454
|
|
2144
2455
|
__builtin_assume(i_offset >= 0);
|
2145
|
-
__builtin_assume(i_offset <
|
2456
|
+
__builtin_assume(i_offset < nwarps);
|
2146
2457
|
__builtin_assume(k >= 0);
|
2147
2458
|
__builtin_assume(k < WARP_SIZE);
|
2148
2459
|
|
@@ -2152,7 +2463,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
|
|
2152
2463
|
const block_q2_K * bx0 = (block_q2_K *) vx;
|
2153
2464
|
|
2154
2465
|
#pragma unroll
|
2155
|
-
for (int i0 = 0; i0 <
|
2466
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2156
2467
|
int i = i0 + i_offset;
|
2157
2468
|
|
2158
2469
|
if (need_check) {
|
@@ -2168,8 +2479,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
|
|
2168
2479
|
const int kbxd = k % blocks_per_tile_x_row;
|
2169
2480
|
|
2170
2481
|
#pragma unroll
|
2171
|
-
for (int i0 = 0; i0 <
|
2172
|
-
int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) %
|
2482
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
|
2483
|
+
int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
|
2173
2484
|
|
2174
2485
|
if (need_check) {
|
2175
2486
|
i = min(i, i_max);
|
@@ -2181,7 +2492,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
|
|
2181
2492
|
}
|
2182
2493
|
|
2183
2494
|
#pragma unroll
|
2184
|
-
for (int i0 = 0; i0 <
|
2495
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
|
2185
2496
|
int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
|
2186
2497
|
|
2187
2498
|
if (need_check) {
|
@@ -2198,68 +2509,24 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
|
|
2198
2509
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2199
2510
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2200
2511
|
|
2201
|
-
|
2202
|
-
|
2203
|
-
|
2204
|
-
__builtin_assume(j < WARP_SIZE);
|
2205
|
-
__builtin_assume(k >= 0);
|
2206
|
-
__builtin_assume(k < WARP_SIZE);
|
2207
|
-
|
2208
|
-
const int kbx = k / QI2_K;
|
2209
|
-
const int kqsx = k % QI2_K;
|
2210
|
-
|
2211
|
-
const int bq8_offset = QR2_K * (kqsx / QI8_1);
|
2212
|
-
const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
|
2512
|
+
const int kbx = k / QI2_K;
|
2513
|
+
const int ky = (k % QI2_K) * QR2_K;
|
2514
|
+
const float * y_df = (const float *) y_ds;
|
2213
2515
|
|
2214
|
-
|
2516
|
+
int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
|
2215
2517
|
|
2216
|
-
int
|
2217
|
-
|
2518
|
+
const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
|
2519
|
+
const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
|
2218
2520
|
|
2219
|
-
|
2220
|
-
|
2221
|
-
|
2222
|
-
d8[l] = y_ds[y_qs_index / QI8_1].x;
|
2521
|
+
#pragma unroll
|
2522
|
+
for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
|
2523
|
+
v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
|
2223
2524
|
}
|
2224
2525
|
|
2225
|
-
|
2226
|
-
}
|
2227
|
-
|
2228
|
-
#define VDR_q3_K_q8_1 1
|
2229
|
-
|
2230
|
-
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl(
|
2231
|
-
const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
|
2232
|
-
const int & scale_offset, const float & d, const float * __restrict__ d8) {
|
2233
|
-
|
2234
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2235
|
-
float sumf = 0.0f;
|
2236
|
-
|
2237
|
-
for (int i = 0; i < QR3_K; ++i) {
|
2238
|
-
const int isc = scale_offset + 2*i;
|
2239
|
-
|
2240
|
-
const int isc_low = isc % (QK_K/32);
|
2241
|
-
const int sc_shift_low = 4 * (isc / (QK_K/32));
|
2242
|
-
const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
|
2243
|
-
|
2244
|
-
const int isc_high = isc % (QK_K/64);
|
2245
|
-
const int sc_shift_high = 2 * (isc / (QK_K/64));
|
2246
|
-
const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
|
2247
|
-
|
2248
|
-
const int sc = (sc_low | sc_high) - 32;
|
2249
|
-
|
2250
|
-
const int vil = (vl >> (2*i)) & 0x03030303;
|
2251
|
-
|
2252
|
-
const int vih = ((vh >> i) << 2) & 0x04040404;
|
2253
|
-
|
2254
|
-
const int vi = __vsubss4(vil, vih);
|
2526
|
+
const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
|
2255
2527
|
|
2256
|
-
|
2257
|
-
|
2258
|
-
|
2259
|
-
return d*sumf;
|
2260
|
-
#else
|
2261
|
-
return 0.0f; // only to satisfy the compiler
|
2262
|
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2528
|
+
const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
|
2529
|
+
return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
|
2263
2530
|
}
|
2264
2531
|
|
2265
2532
|
static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
@@ -2277,23 +2544,24 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
|
2277
2544
|
// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
|
2278
2545
|
const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
|
2279
2546
|
|
2280
|
-
int
|
2547
|
+
int u[QR3_K];
|
2281
2548
|
float d8[QR3_K];
|
2282
2549
|
|
2550
|
+
#pragma unroll
|
2283
2551
|
for (int i = 0; i < QR3_K; ++i) {
|
2284
2552
|
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
2285
2553
|
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
2286
2554
|
}
|
2287
2555
|
|
2288
|
-
return
|
2556
|
+
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
|
2289
2557
|
}
|
2290
2558
|
|
2291
|
-
static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2559
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2292
2560
|
|
2293
|
-
__shared__ int tile_x_ql[
|
2294
|
-
__shared__ half2 tile_x_dm[
|
2295
|
-
__shared__ int tile_x_qh[
|
2296
|
-
__shared__ int tile_x_sc[
|
2561
|
+
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
|
2562
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
|
2563
|
+
__shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
|
2564
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
|
2297
2565
|
|
2298
2566
|
*x_ql = tile_x_ql;
|
2299
2567
|
*x_dm = tile_x_dm;
|
@@ -2301,12 +2569,12 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 **
|
|
2301
2569
|
*x_sc = tile_x_sc;
|
2302
2570
|
}
|
2303
2571
|
|
2304
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
|
2572
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
|
2305
2573
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2306
2574
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2307
2575
|
|
2308
2576
|
__builtin_assume(i_offset >= 0);
|
2309
|
-
__builtin_assume(i_offset <
|
2577
|
+
__builtin_assume(i_offset < nwarps);
|
2310
2578
|
__builtin_assume(k >= 0);
|
2311
2579
|
__builtin_assume(k < WARP_SIZE);
|
2312
2580
|
|
@@ -2316,7 +2584,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
|
|
2316
2584
|
const block_q3_K * bx0 = (block_q3_K *) vx;
|
2317
2585
|
|
2318
2586
|
#pragma unroll
|
2319
|
-
for (int i0 = 0; i0 <
|
2587
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2320
2588
|
int i = i0 + i_offset;
|
2321
2589
|
|
2322
2590
|
if (need_check) {
|
@@ -2330,10 +2598,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
|
|
2330
2598
|
|
2331
2599
|
const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
|
2332
2600
|
const int kbxd = k % blocks_per_tile_x_row;
|
2601
|
+
float * x_dmf = (float *) x_dm;
|
2333
2602
|
|
2334
2603
|
#pragma unroll
|
2335
|
-
for (int i0 = 0; i0 <
|
2336
|
-
int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) %
|
2604
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
|
2605
|
+
int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
|
2337
2606
|
|
2338
2607
|
if (need_check) {
|
2339
2608
|
i = min(i, i_max);
|
@@ -2341,11 +2610,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
|
|
2341
2610
|
|
2342
2611
|
const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2343
2612
|
|
2344
|
-
|
2613
|
+
x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
|
2345
2614
|
}
|
2346
2615
|
|
2347
2616
|
#pragma unroll
|
2348
|
-
for (int i0 = 0; i0 <
|
2617
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
|
2349
2618
|
int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
|
2350
2619
|
|
2351
2620
|
if (need_check) {
|
@@ -2354,11 +2623,12 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
|
|
2354
2623
|
|
2355
2624
|
const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
|
2356
2625
|
|
2357
|
-
|
2626
|
+
// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
|
2627
|
+
x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
|
2358
2628
|
}
|
2359
2629
|
|
2360
2630
|
#pragma unroll
|
2361
|
-
for (int i0 = 0; i0 <
|
2631
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
|
2362
2632
|
int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
|
2363
2633
|
|
2364
2634
|
if (need_check) {
|
@@ -2367,7 +2637,19 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
|
|
2367
2637
|
|
2368
2638
|
const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
|
2369
2639
|
|
2370
|
-
|
2640
|
+
const int ksc = k % (QI3_K/4);
|
2641
|
+
|
2642
|
+
const int ksc_low = ksc % (QI3_K/8);
|
2643
|
+
const int shift_low = 4 * (ksc / (QI3_K/8));
|
2644
|
+
const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
|
2645
|
+
|
2646
|
+
const int ksc_high = QI3_K/8;
|
2647
|
+
const int shift_high = 2 * ksc;
|
2648
|
+
const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
|
2649
|
+
|
2650
|
+
const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
|
2651
|
+
|
2652
|
+
x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
|
2371
2653
|
}
|
2372
2654
|
}
|
2373
2655
|
|
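q3_K keeps its block scales as 6-bit values split across the 12-byte scales array; the loop above gathers 4 low bits and 2 high bits per scale into one byte and then subtracts 32 from four bytes at once with __vsubss4(sc_low | sc_high, 0x20202020). Per scale, that amounts to the following (host-side sketch with illustrative names):

    #include <cstdio>

    // Reassemble one 6-bit q3_K scale and center it: result is in [-32, 31].
    static int make_q3_scale(int low4, int high2) {      // low4 in [0,15], high2 in [0,3]
        const int packed = (low4 & 0x0F) | ((high2 & 0x03) << 4);
        return packed - 32;
    }

    int main(void) {
        printf("%d %d %d\n", make_q3_scale(0, 0), make_q3_scale(15, 3), make_q3_scale(0, 2));
        return 0;
    }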
@@ -2375,63 +2657,29 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
|
|
2375
2657
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2376
2658
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2377
2659
|
|
2378
|
-
__builtin_assume(i >= 0);
|
2379
|
-
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2380
|
-
__builtin_assume(j >= 0);
|
2381
|
-
__builtin_assume(j < WARP_SIZE);
|
2382
|
-
__builtin_assume(k >= 0);
|
2383
|
-
__builtin_assume(k < WARP_SIZE);
|
2384
|
-
|
2385
2660
|
const int kbx = k / QI3_K;
|
2386
|
-
const int
|
2661
|
+
const int ky = (k % QI3_K) * QR3_K;
|
2662
|
+
const float * x_dmf = (const float *) x_dm;
|
2663
|
+
const float * y_df = (const float *) y_ds;
|
2387
2664
|
|
2388
|
-
const
|
2389
|
-
const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
|
2665
|
+
const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
|
2390
2666
|
|
2391
|
-
|
2392
|
-
|
2393
|
-
// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
|
2394
|
-
const int vh = ~x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset;
|
2395
|
-
|
2396
|
-
int u[QR3_K];
|
2397
|
-
float d8[QR3_K];
|
2398
|
-
|
2399
|
-
for (int l = 0; l < QR3_K; ++ l) {
|
2400
|
-
const int y_qs_index = j * (QR3_K*WARP_SIZE) + kbx * (QR3_K*QI3_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
|
2401
|
-
u[l] = y_qs[y_qs_index];
|
2402
|
-
d8[l] = y_ds[y_qs_index / QI8_1].x;
|
2403
|
-
}
|
2404
|
-
|
2405
|
-
return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset,
|
2406
|
-
x_dm[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx].x, d8);
|
2407
|
-
}
|
2408
|
-
|
2409
|
-
#define VDR_q4_K_q8_1 2
|
2410
|
-
|
2411
|
-
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl(
|
2412
|
-
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
2413
|
-
const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
|
2414
|
-
|
2415
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2416
|
-
float sumf_d = 0.0f;
|
2417
|
-
float sumf_m = 0.0f;
|
2667
|
+
int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
|
2418
2668
|
|
2419
|
-
|
2420
|
-
|
2421
|
-
const int
|
2669
|
+
#pragma unroll
|
2670
|
+
for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
|
2671
|
+
const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
|
2672
|
+
const int shift = 2 * ((ky % 32) / 8);
|
2673
|
+
const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
|
2422
2674
|
|
2423
|
-
const int
|
2424
|
-
const int
|
2675
|
+
const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
|
2676
|
+
const int vlh = (vh << 2) & 0x04040404;
|
2425
2677
|
|
2426
|
-
|
2427
|
-
sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
|
2678
|
+
v[l] = __vsubss4(vll, vlh);
|
2428
2679
|
}
|
2429
2680
|
|
2430
|
-
|
2431
|
-
|
2432
|
-
#else
|
2433
|
-
return 0.0f; // only to satisfy the compiler
|
2434
|
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2681
|
+
const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
|
2682
|
+
return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
|
2435
2683
|
}
|
2436
2684
|
|
2437
2685
|
static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
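Because q3_K (and q6_K) blocks carry a single float scale rather than a scale/min pair, the rewritten mul_mat path reuses the half2 x_dm tile to hold plain floats and reads it back through a casted pointer (x_dmf / y_df above); this works because half2 and float are both 4 bytes wide. A tiny host-side sketch of that reuse (the unsigned int buffer stands in for the half2 tile):

    #include <cstdio>

    int main(void) {
        unsigned int tile_dm[4] = {0};           // 4-byte slots standing in for half2 entries
        float * tile_dmf = (float *) tile_dm;    // same trick as x_dmf = (float *) x_dm

        tile_dmf[2] = 0.125f;                    // analogous to x_dmf[...] = bxi->d;
        printf("stored scale: %f\n", tile_dmf[2]);
        return 0;
    }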
@@ -2478,7 +2726,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|
2478
2726
|
u[2*i+1] = q8[4];
|
2479
2727
|
}
|
2480
2728
|
|
2481
|
-
return
|
2729
|
+
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
2482
2730
|
|
2483
2731
|
#else
|
2484
2732
|
|
@@ -2521,29 +2769,30 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|
2521
2769
|
return dall * sumf_d - dmin * sumf_m;
|
2522
2770
|
|
2523
2771
|
#else
|
2772
|
+
assert(false);
|
2524
2773
|
return 0.0f; // only to satisfy the compiler
|
2525
2774
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2526
2775
|
|
2527
2776
|
#endif
|
2528
2777
|
}
|
2529
2778
|
|
2530
|
-
static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2779
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2531
2780
|
|
2532
|
-
__shared__ int tile_x_ql[
|
2533
|
-
__shared__ half2 tile_x_dm[
|
2534
|
-
__shared__ int tile_x_sc[
|
2781
|
+
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
|
2782
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
|
2783
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
|
2535
2784
|
|
2536
2785
|
*x_ql = tile_x_ql;
|
2537
2786
|
*x_dm = tile_x_dm;
|
2538
2787
|
*x_sc = tile_x_sc;
|
2539
2788
|
}
|
2540
2789
|
|
2541
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
|
2790
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
|
2542
2791
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2543
2792
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2544
2793
|
|
2545
2794
|
__builtin_assume(i_offset >= 0);
|
2546
|
-
__builtin_assume(i_offset <
|
2795
|
+
__builtin_assume(i_offset < nwarps);
|
2547
2796
|
__builtin_assume(k >= 0);
|
2548
2797
|
__builtin_assume(k < WARP_SIZE);
|
2549
2798
|
|
@@ -2553,7 +2802,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
2553
2802
|
const block_q4_K * bx0 = (block_q4_K *) vx;
|
2554
2803
|
|
2555
2804
|
#pragma unroll
|
2556
|
-
for (int i0 = 0; i0 <
|
2805
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2557
2806
|
int i = i0 + i_offset;
|
2558
2807
|
|
2559
2808
|
if (need_check) {
|
@@ -2566,11 +2815,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
2566
2815
|
}
|
2567
2816
|
|
2568
2817
|
const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
|
2569
|
-
const int kbxd = k % blocks_per_tile_x_row;
|
2818
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
2570
2819
|
|
2571
2820
|
#pragma unroll
|
2572
|
-
for (int i0 = 0; i0 <
|
2573
|
-
int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) %
|
2821
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
|
2822
|
+
int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
|
2574
2823
|
|
2575
2824
|
if (need_check) {
|
2576
2825
|
i = min(i, i_max);
|
@@ -2582,8 +2831,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
2582
2831
|
}
|
2583
2832
|
|
2584
2833
|
#pragma unroll
|
2585
|
-
for (int i0 = 0; i0 <
|
2586
|
-
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) %
|
2834
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
|
2835
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
|
2587
2836
|
|
2588
2837
|
if (need_check) {
|
2589
2838
|
i = min(i, i_max);
|
@@ -2591,90 +2840,27 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
2591
2840
|
|
2592
2841
|
const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
|
2593
2842
|
|
2594
|
-
|
2595
|
-
}
|
2596
|
-
}
|
2597
|
-
|
2598
|
-
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
|
2599
|
-
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2600
|
-
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2601
|
-
|
2602
|
-
__builtin_assume(i >= 0);
|
2603
|
-
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2604
|
-
__builtin_assume(j >= 0);
|
2605
|
-
__builtin_assume(j < WARP_SIZE);
|
2606
|
-
__builtin_assume(k >= 0);
|
2607
|
-
__builtin_assume(k < WARP_SIZE);
|
2608
|
-
|
2609
|
-
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
2610
|
-
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
2611
|
-
|
2612
|
-
int v[2];
|
2613
|
-
int u[2*QR4_K];
|
2614
|
-
float d8[QR4_K];
|
2843
|
+
const int * scales = (int *) bxi->scales;
|
2615
2844
|
|
2616
|
-
|
2617
|
-
const int bq8_offset = QR4_K * ((kqsx/2) / (QI8_1/2));
|
2845
|
+
const int ksc = k % (WARP_SIZE/8);
|
2618
2846
|
|
2619
|
-
|
2620
|
-
|
2847
|
+
// scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
|
2848
|
+
int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
|
2849
|
+
scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
|
2621
2850
|
|
2622
|
-
|
2623
|
-
uint16_t aux[2];
|
2624
|
-
const int l = bq8_offset/2;
|
2625
|
-
if (l < 2) {
|
2626
|
-
aux[0] = scales[l+0] & 0x3f3f;
|
2627
|
-
aux[1] = scales[l+2] & 0x3f3f;
|
2628
|
-
} else {
|
2629
|
-
aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
|
2630
|
-
aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
|
2851
|
+
x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
|
2631
2852
|
}
|
2632
|
-
const uint8_t * sc = (const uint8_t *)aux;
|
2633
|
-
const uint8_t * m = sc + 2;
|
2634
|
-
|
2635
|
-
for (int l = 0; l < QR4_K; ++l) {
|
2636
|
-
const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
|
2637
|
-
u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
|
2638
|
-
u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
|
2639
|
-
d8[l] = y_ds[kqsy / QI8_1].x;
|
2640
|
-
}
|
2641
|
-
|
2642
|
-
return vec_dot_q4_K_q8_1_impl(v, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K + kbx], d8);
|
2643
2853
|
}
|
2644
2854
|
|
2645
|
-
|
2646
|
-
|
2647
|
-
|
2648
|
-
const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
2649
|
-
const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
|
2650
|
-
|
2651
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2652
|
-
float sumf_d = 0.0f;
|
2653
|
-
float sumf_m = 0.0f;
|
2654
|
-
|
2655
|
-
for (int i = 0; i < QR5_K; ++i) {
|
2656
|
-
const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
|
2657
|
-
const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
|
2658
|
-
|
2659
|
-
const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
|
2660
|
-
const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
|
2661
|
-
|
2662
|
-
const int v0i = vl0i | vh0i;
|
2663
|
-
const int v1i = vl1i | vh1i;
|
2664
|
-
|
2665
|
-
const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
|
2666
|
-
const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
|
2667
|
-
|
2668
|
-
sumf_d += d8[i] * (dot1 * sc[i]);
|
2669
|
-
sumf_m += d8[i] * (dot2 * m[i]);
|
2670
|
-
|
2671
|
-
}
|
2855
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
|
2856
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2857
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2672
2858
|
|
2673
|
-
|
2859
|
+
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
|
2674
2860
|
|
2675
|
-
|
2676
|
-
return
|
2677
|
-
|
2861
|
+
const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
|
2862
|
+
return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
|
2863
|
+
x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
|
2678
2864
|
}
|
2679
2865
|
|
2680
2866
|
static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
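For q4_K (and q5_K below), the loop above repacks the block's 6-bit scales and mins so that, per the in-code comment, the bytes end up ordered sc0..sc7 followed by the mins; the mul_mat dot product can then address them with a plain uint8_t pointer (sc and sc+8). The underlying 6-bit layout is the same one the scalar path works from; a sketch of that per-index unpacking (helper name and exact form are illustrative, modeled on the CPU-side q4_K handling):

    #include <cstdint>
    #include <cstdio>

    // Sketch: scale j and min j from the 12-byte q4_K scales field. Entries
    // 4..7 take their low 4 bits from bytes 8..11 and their top 2 bits from
    // the upper bits of bytes 0..7.
    static void get_scale_min_sketch(int j, const uint8_t * q, uint8_t * sc, uint8_t * m) {
        if (j < 4) {
            *sc = q[j]     & 63;
            *m  = q[j + 4] & 63;
        } else {
            *sc = (q[j + 4] & 0x0F) | ((q[j - 4] >> 6) << 4);
            *m  = (q[j + 4] >>   4) | ((q[j    ] >> 6) << 4);
        }
    }

    int main(void) {
        uint8_t scales[12] = {0};
        scales[0] = 63;                          // toy block: sc0 = 63, everything else 0
        for (int j = 0; j < 8; ++j) {
            uint8_t sc, m;
            get_scale_min_sketch(j, scales, &sc, &m);
            printf("j=%d sc=%u m=%u\n", j, sc, m);
        }
        return 0;
    }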
@@ -2711,6 +2897,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|
2711
2897
|
const uint8_t * sc = (const uint8_t *)aux;
|
2712
2898
|
const uint8_t * m = sc + 2;
|
2713
2899
|
|
2900
|
+
#pragma unroll
|
2714
2901
|
for (int i = 0; i < QR5_K; ++i) {
|
2715
2902
|
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
2716
2903
|
d8[i] = bq8i->ds.x;
|
@@ -2720,7 +2907,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|
2720
2907
|
u[2*i+1] = q8[4];
|
2721
2908
|
}
|
2722
2909
|
|
2723
|
-
return
|
2910
|
+
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
|
2724
2911
|
|
2725
2912
|
#else
|
2726
2913
|
|
@@ -2759,31 +2946,30 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|
2759
2946
|
return d * sumf_d;
|
2760
2947
|
|
2761
2948
|
#else
|
2949
|
+
assert(false);
|
2762
2950
|
return 0.0f; // only to satisfy the compiler
|
2763
2951
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2764
2952
|
|
2765
2953
|
#endif
|
2766
2954
|
}
|
2767
2955
|
|
2768
|
-
static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2956
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2769
2957
|
|
2770
|
-
__shared__ int tile_x_ql[
|
2771
|
-
__shared__ half2 tile_x_dm[
|
2772
|
-
__shared__ int
|
2773
|
-
__shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
|
2958
|
+
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
2959
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
|
2960
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
|
2774
2961
|
|
2775
2962
|
*x_ql = tile_x_ql;
|
2776
2963
|
*x_dm = tile_x_dm;
|
2777
|
-
*x_qh = tile_x_qh;
|
2778
2964
|
*x_sc = tile_x_sc;
|
2779
2965
|
}
|
2780
2966
|
|
2781
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
|
2967
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
|
2782
2968
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2783
2969
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2784
2970
|
|
2785
2971
|
__builtin_assume(i_offset >= 0);
|
2786
|
-
__builtin_assume(i_offset <
|
2972
|
+
__builtin_assume(i_offset < nwarps);
|
2787
2973
|
__builtin_assume(k >= 0);
|
2788
2974
|
__builtin_assume(k < WARP_SIZE);
|
2789
2975
|
|
@@ -2793,7 +2979,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
|
|
2793
2979
|
const block_q5_K * bx0 = (block_q5_K *) vx;
|
2794
2980
|
|
2795
2981
|
#pragma unroll
|
2796
|
-
for (int i0 = 0; i0 <
|
2982
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2797
2983
|
int i = i0 + i_offset;
|
2798
2984
|
|
2799
2985
|
if (need_check) {
|
@@ -2801,16 +2987,29 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
|
|
2801
2987
|
}
|
2802
2988
|
|
2803
2989
|
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
|
2990
|
+
const int ky = QR5_K*kqsx;
|
2804
2991
|
|
2805
|
-
|
2992
|
+
const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
2993
|
+
const int ql0 = (ql >> 0) & 0x0F0F0F0F;
|
2994
|
+
const int ql1 = (ql >> 4) & 0x0F0F0F0F;
|
2995
|
+
|
2996
|
+
const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
|
2997
|
+
const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
|
2998
|
+
const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
|
2999
|
+
|
3000
|
+
const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
|
3001
|
+
const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
|
3002
|
+
|
3003
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
|
3004
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
|
2806
3005
|
}
|
2807
3006
|
|
2808
3007
|
const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
|
2809
|
-
const int kbxd = k % blocks_per_tile_x_row;
|
3008
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
2810
3009
|
|
2811
3010
|
#pragma unroll
|
2812
|
-
for (int i0 = 0; i0 <
|
2813
|
-
int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) %
|
3011
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
|
3012
|
+
int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
|
2814
3013
|
|
2815
3014
|
if (need_check) {
|
2816
3015
|
i = min(i, i_max);
|
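load_tiles_q5_K now unpacks each q5_K value fully while loading: the low 4 bits come from the qs nibble and the fifth bit from qh, giving byte values in [0, 31] that are stored in an x_ql row that is twice as wide (2*WARP_SIZE ints plus one int of padding). Per value, the packing is simply (host-side sketch):

    #include <cassert>

    // 4 low bits from qs, 1 high bit from qh, result in [0, 31] -- the same
    // thing the ql0|qh0 / ql1|qh1 combinations above do four bytes at a time.
    static int unpack_q5(int low_nibble, int high_bit) {
        return (low_nibble & 0x0F) | ((high_bit & 1) << 4);
    }

    int main(void) {
        assert(unpack_q5(0x0F, 1) == 31);
        assert(unpack_q5(0x03, 0) ==  3);
        assert(unpack_q5(0x00, 1) == 16);
        return 0;
    }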
@@ -2822,107 +3021,37 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
|
|
2822
3021
|
}
|
2823
3022
|
|
2824
3023
|
#pragma unroll
|
2825
|
-
for (int i0 = 0; i0 <
|
2826
|
-
int i = i0 + i_offset *
|
2827
|
-
|
2828
|
-
if (need_check) {
|
2829
|
-
i = min(i, i_max);
|
2830
|
-
}
|
2831
|
-
|
2832
|
-
const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI5_K/4);
|
2833
|
-
|
2834
|
-
x_qh[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->qh, k % (QI5_K/4));
|
2835
|
-
}
|
2836
|
-
|
2837
|
-
#pragma unroll
|
2838
|
-
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
|
2839
|
-
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
|
3024
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
|
3025
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
|
2840
3026
|
|
2841
3027
|
if (need_check) {
|
2842
3028
|
i = min(i, i_max);
|
2843
|
-
}
|
2844
|
-
|
2845
|
-
const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
|
2846
|
-
|
2847
|
-
x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI5_K/8));
|
2848
|
-
}
|
2849
|
-
}
|
2850
|
-
|
2851
|
-
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
|
2852
|
-
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2853
|
-
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2854
|
-
|
2855
|
-
__builtin_assume(i >= 0);
|
2856
|
-
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2857
|
-
__builtin_assume(j >= 0);
|
2858
|
-
__builtin_assume(j < WARP_SIZE);
|
2859
|
-
__builtin_assume(k >= 0);
|
2860
|
-
__builtin_assume(k < WARP_SIZE);
|
2861
|
-
|
2862
|
-
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
2863
|
-
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
2864
|
-
|
2865
|
-
int vl[2];
|
2866
|
-
int vh[2];
|
2867
|
-
int u[2*QR4_K];
|
2868
|
-
float d8[QR4_K];
|
2869
|
-
|
2870
|
-
const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2));
|
2871
|
-
|
2872
|
-
vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
|
2873
|
-
vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
|
2874
|
-
|
2875
|
-
vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset;
|
2876
|
-
vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset;
|
2877
|
-
|
2878
|
-
const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
|
2879
|
-
uint16_t aux[2];
|
2880
|
-
const int l = bq8_offset/2;
|
2881
|
-
if (l < 2) {
|
2882
|
-
aux[0] = scales[l+0] & 0x3f3f;
|
2883
|
-
aux[1] = scales[l+2] & 0x3f3f;
|
2884
|
-
} else {
|
2885
|
-
aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
|
2886
|
-
aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
|
2887
|
-
}
|
2888
|
-
const uint8_t * sc = (const uint8_t *)aux;
|
2889
|
-
const uint8_t * m = sc + 2;
|
2890
|
-
|
2891
|
-
for (int l = 0; l < QR5_K; ++l) {
|
2892
|
-
const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
|
2893
|
-
u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
|
2894
|
-
u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
|
2895
|
-
d8[l] = y_ds[kqsy / QI8_1].x;
|
2896
|
-
}
|
2897
|
-
|
2898
|
-
return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K + kbx], d8);
|
2899
|
-
}
|
2900
|
-
|
2901
|
-
#define VDR_q6_K_q8_1 1
|
2902
|
-
|
2903
|
-
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl(
|
2904
|
-
const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
|
2905
|
-
const float & d, const float * __restrict__ d8) {
|
2906
|
-
|
2907
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2908
|
-
float sumf = 0.0f;
|
3029
|
+
}
|
2909
3030
|
|
2910
|
-
|
2911
|
-
const int sc = scales[4*i];
|
3031
|
+
const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
|
2912
3032
|
|
2913
|
-
const int
|
3033
|
+
const int * scales = (int *) bxi->scales;
|
2914
3034
|
|
2915
|
-
const int
|
3035
|
+
const int ksc = k % (WARP_SIZE/8);
|
2916
3036
|
|
2917
|
-
|
3037
|
+
// scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
|
3038
|
+
int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
|
3039
|
+
scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
|
2918
3040
|
|
2919
|
-
|
3041
|
+
x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
|
2920
3042
|
}
|
3043
|
+
}
|
2921
3044
|
|
2922
|
-
|
2923
|
-
|
2924
|
-
|
2925
|
-
|
3045
|
+
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
|
3046
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
3047
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
3048
|
+
|
3049
|
+
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
|
3050
|
+
|
3051
|
+
const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
|
3052
|
+
const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
|
3053
|
+
return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
|
3054
|
+
x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
|
2926
3055
|
}
|
2927
3056
|
|
2928
3057
|
static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
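The q5_K (and q6_K) mul_mat dot products index the widened tile with a row stride of QR*WARP_SIZE + 1; the extra int per row is padding, presumably to avoid shared-memory bank conflicts, and each k step consumes QR consecutive unpacked ints together with the matching q8_1 ints from the y tile. A small index sketch (coordinates are illustrative; the real strides come from the kernel above):

    #include <cstdio>

    int main(void) {
        const int WARP_SIZE = 32, QR = 2;                     // QR5_K == 2
        const int row_stride = QR*WARP_SIZE + 1;              // +1 int of padding per row

        const int i = 5, j = 3, k = 7;                        // example tile coordinates
        const int index_x = i*row_stride + QR*k;              // first of QR ints in x_ql
        const int index_y = j*WARP_SIZE + (QR*k) % WARP_SIZE; // matching offset into y_qs

        printf("index_x=%d index_y=%d\n", index_x, index_y);
        return 0;
    }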
@@ -2942,33 +3071,32 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
|
2942
3071
|
int u[QR6_K];
|
2943
3072
|
float d8[QR6_K];
|
2944
3073
|
|
3074
|
+
#pragma unroll
|
2945
3075
|
for (int i = 0; i < QR6_K; ++i) {
|
2946
3076
|
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
|
2947
3077
|
d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
|
2948
3078
|
}
|
2949
3079
|
|
2950
|
-
return
|
3080
|
+
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
|
2951
3081
|
}
|
2952
3082
|
|
2953
|
-
static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
3083
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2954
3084
|
|
2955
|
-
__shared__ int tile_x_ql[
|
2956
|
-
__shared__ half2 tile_x_dm[
|
2957
|
-
__shared__ int
|
2958
|
-
__shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
|
3085
|
+
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
3086
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
|
3087
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
|
2959
3088
|
|
2960
3089
|
*x_ql = tile_x_ql;
|
2961
3090
|
*x_dm = tile_x_dm;
|
2962
|
-
*x_qh = tile_x_qh;
|
2963
3091
|
*x_sc = tile_x_sc;
|
2964
3092
|
}
|
2965
3093
|
|
2966
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
|
3094
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
|
2967
3095
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2968
3096
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2969
3097
|
|
2970
3098
|
__builtin_assume(i_offset >= 0);
|
2971
|
-
__builtin_assume(i_offset <
|
3099
|
+
__builtin_assume(i_offset < nwarps);
|
2972
3100
|
__builtin_assume(k >= 0);
|
2973
3101
|
__builtin_assume(k < WARP_SIZE);
|
2974
3102
|
|
@@ -2978,7 +3106,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
|
|
2978
3106
|
const block_q6_K * bx0 = (block_q6_K *) vx;
|
2979
3107
|
|
2980
3108
|
#pragma unroll
|
2981
|
-
for (int i0 = 0; i0 <
|
3109
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2982
3110
|
int i = i0 + i_offset;
|
2983
3111
|
|
2984
3112
|
if (need_check) {
|
@@ -2986,42 +3114,43 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
|
|
2986
3114
|
}
|
2987
3115
|
|
2988
3116
|
const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
|
3117
|
+
const int ky = QR6_K*kqsx;
|
2989
3118
|
|
2990
|
-
|
2991
|
-
|
2992
|
-
|
2993
|
-
const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
|
2994
|
-
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
2995
|
-
|
2996
|
-
#pragma unroll
|
2997
|
-
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
|
2998
|
-
int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
|
3119
|
+
const int ql = get_int_from_uint8(bxi->ql, kqsx);
|
3120
|
+
const int ql0 = (ql >> 0) & 0x0F0F0F0F;
|
3121
|
+
const int ql1 = (ql >> 4) & 0x0F0F0F0F;
|
2999
3122
|
|
3000
|
-
|
3001
|
-
|
3002
|
-
|
3123
|
+
const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
|
3124
|
+
const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
|
3125
|
+
const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
|
3003
3126
|
|
3004
|
-
const
|
3127
|
+
const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
|
3128
|
+
const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
|
3005
3129
|
|
3006
|
-
|
3130
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
|
3131
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
|
3007
3132
|
}
|
3008
3133
|
|
3134
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
|
3135
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
3136
|
+
float * x_dmf = (float *) x_dm;
|
3137
|
+
|
3009
3138
|
#pragma unroll
|
3010
|
-
for (int i0 = 0; i0 <
|
3011
|
-
int i = i0 + i_offset *
|
3139
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
|
3140
|
+
int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
|
3012
3141
|
|
3013
3142
|
if (need_check) {
|
3014
3143
|
i = min(i, i_max);
|
3015
3144
|
}
|
3016
3145
|
|
3017
|
-
const block_q6_K * bxi = bx0 + i*blocks_per_row +
|
3146
|
+
const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
3018
3147
|
|
3019
|
-
|
3148
|
+
x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
|
3020
3149
|
}
|
3021
3150
|
|
3022
3151
|
#pragma unroll
|
3023
|
-
for (int i0 = 0; i0 <
|
3024
|
-
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) %
|
3152
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
|
3153
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
|
3025
3154
|
|
3026
3155
|
if (need_check) {
|
3027
3156
|
i = min(i, i_max);
|
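q6_K values are 6 bits wide (4 from ql, 2 from qh) and are centered by subtracting 32; the kernel does this four bytes at a time with __vsubss4(ql|qh, 0x20202020) while loading, so the tile already holds signed values in [-32, 31]. Per value (host-side sketch):

    #include <cassert>

    // 4 low bits from ql, 2 high bits from qh, minus 32.
    static int unpack_q6(int low_nibble, int high_two_bits) {
        const int q = (low_nibble & 0x0F) | ((high_two_bits & 0x03) << 4);
        return q - 32;                                       // signed value in [-32, 31]
    }

    int main(void) {
        assert(unpack_q6(0x0F, 0x03) ==  31);
        assert(unpack_q6(0x00, 0x00) == -32);
        assert(unpack_q6(0x00, 0x02) ==   0);
        return 0;
    }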
@@ -3037,41 +3166,19 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
|
|
3037
3166
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
3038
3167
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
3039
3168
|
|
3040
|
-
|
3041
|
-
|
3042
|
-
__builtin_assume(j >= 0);
|
3043
|
-
__builtin_assume(j < WARP_SIZE);
|
3044
|
-
__builtin_assume(k >= 0);
|
3045
|
-
__builtin_assume(k < WARP_SIZE);
|
3046
|
-
|
3047
|
-
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
3048
|
-
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
3049
|
-
|
3050
|
-
const int bq8_offset = 2 * QR6_K * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/4);
|
3051
|
-
const int scale_offset = (QI6_K/4) * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/8);
|
3052
|
-
const int vh_shift = 2 * ((kqsx % (QI6_K/2)) / (QI6_K/4));
|
3169
|
+
const float * x_dmf = (const float *) x_dm;
|
3170
|
+
const float * y_df = (const float *) y_ds;
|
3053
3171
|
|
3054
|
-
const
|
3172
|
+
const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
|
3055
3173
|
|
3056
|
-
const int
|
3057
|
-
const
|
3058
|
-
|
3059
|
-
int u[QR6_K];
|
3060
|
-
float d8[QR6_K];
|
3061
|
-
|
3062
|
-
for (int l = 0; l < QR6_K; ++l) {
|
3063
|
-
const int kqsy = j * (QR6_K*WARP_SIZE) + kbx * (QR6_K*QI6_K) + (bq8_offset + 2*l)*QI8_1 + kqsx % QI8_1;
|
3064
|
-
u[l] = y_qs[kqsy];
|
3065
|
-
d8[l] = y_ds[kqsy / QI8_1].x;
|
3066
|
-
}
|
3067
|
-
|
3068
|
-
return vec_dot_q6_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales,
|
3069
|
-
x_dm[i * (WARP_SIZE/QI6_K) + i/QI6_K + kbx].x, d8);
|
3174
|
+
const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
|
3175
|
+
const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
|
3176
|
+
return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
|
3070
3177
|
}
|
3071
3178
|
|
3072
|
-
template <int qk, int qr, int qi, typename block_q_t,
|
3179
|
+
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
|
3073
3180
|
allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
|
3074
|
-
static
|
3181
|
+
static __device__ __forceinline__ void mul_mat_q(
|
3075
3182
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3076
3183
|
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3077
3184
|
|
@@ -3084,14 +3191,10 @@ static __global__ void mul_mat_q(
|
|
3084
3191
|
|
3085
3192
|
const int & ncols_dst = ncols_y;
|
3086
3193
|
|
3087
|
-
const int
|
3088
|
-
const int tid_y = threadIdx.y;
|
3089
|
-
|
3090
|
-
const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
|
3194
|
+
const int row_dst_0 = blockIdx.x*mmq_y;
|
3091
3195
|
const int & row_x_0 = row_dst_0;
|
3092
|
-
const int row_dst = row_dst_0 + tid_x;
|
3093
3196
|
|
3094
|
-
const int col_dst_0 = blockIdx.y*
|
3197
|
+
const int col_dst_0 = blockIdx.y*mmq_x;
|
3095
3198
|
const int & col_y_0 = col_dst_0;
|
3096
3199
|
|
3097
3200
|
int * tile_x_ql = nullptr;
|
@@ -3101,75 +3204,444 @@ static __global__ void mul_mat_q(
|
|
3101
3204
|
|
3102
3205
|
allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
|
3103
3206
|
|
3104
|
-
|
3105
|
-
|
3106
|
-
__shared__ int tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
|
3107
|
-
__shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];
|
3207
|
+
__shared__ int tile_y_qs[mmq_x * WARP_SIZE];
|
3208
|
+
__shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
|
3108
3209
|
|
3109
|
-
float sum[
|
3210
|
+
float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
|
3110
3211
|
|
3111
3212
|
for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
|
3112
3213
|
|
3113
3214
|
load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
|
3114
|
-
|
3215
|
+
threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
|
3115
3216
|
|
3217
|
+
#pragma unroll
|
3116
3218
|
for (int ir = 0; ir < qr; ++ir) {
|
3117
|
-
const int kqs = ir*WARP_SIZE +
|
3219
|
+
const int kqs = ir*WARP_SIZE + threadIdx.x;
|
3118
3220
|
const int kbxd = kqs / QI8_1;
|
3119
3221
|
|
3120
|
-
|
3121
|
-
|
3222
|
+
#pragma unroll
|
3223
|
+
for (int i = 0; i < mmq_x; i += nwarps) {
|
3224
|
+
const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
|
3122
3225
|
|
3123
3226
|
const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
|
3124
3227
|
|
3125
|
-
|
3228
|
+
const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
|
3229
|
+
tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
|
3126
3230
|
}
|
3127
|
-
}
|
3128
3231
|
|
3129
|
-
|
3130
|
-
|
3131
|
-
|
3132
|
-
|
3133
|
-
|
3134
|
-
|
3232
|
+
#pragma unroll
|
3233
|
+
for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
|
3234
|
+
const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
|
3235
|
+
const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
|
3236
|
+
const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
|
3237
|
+
|
3238
|
+
// if the sum is not needed it's faster to transform the scale to f32 ahead of time
|
3239
|
+
const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
|
3240
|
+
half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
|
3241
|
+
if (need_sum) {
|
3242
|
+
*dsi_dst = *dsi_src;
|
3243
|
+
} else {
|
3244
|
+
float * dfi_dst = (float *) dsi_dst;
|
3245
|
+
*dfi_dst = (*dsi_src).x;
|
3246
|
+
}
|
3247
|
+
}
|
3135
3248
|
|
3136
|
-
|
3249
|
+
__syncthreads();
|
3137
3250
|
|
3138
|
-
#
|
3139
|
-
|
3140
|
-
#endif // __CUDA_ARCH__ >= 700
|
3141
|
-
for (int k = 0; k < WARP_SIZE; k += vdr) {
|
3251
|
+
// #pragma unroll // unrolling this loop causes too much register pressure
|
3252
|
+
for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
|
3142
3253
|
#pragma unroll
|
3143
|
-
|
3254
|
+
for (int j = 0; j < mmq_x; j += nwarps) {
|
3144
3255
|
#pragma unroll
|
3145
|
-
|
3146
|
-
|
3147
|
-
|
3256
|
+
for (int i = 0; i < mmq_y; i += WARP_SIZE) {
|
3257
|
+
sum[i/WARP_SIZE][j/nwarps] += vec_dot(
|
3258
|
+
tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
|
3259
|
+
threadIdx.x + i, threadIdx.y + j, k);
|
3260
|
+
}
|
3148
3261
|
}
|
3149
3262
|
}
|
3150
|
-
}
|
3151
|
-
|
3152
|
-
__syncthreads();
|
3153
|
-
}
|
3154
3263
|
|
3155
|
-
|
3156
|
-
|
3157
|
-
return;
|
3264
|
+
__syncthreads();
|
3265
|
+
}
|
3158
3266
|
}
|
3159
3267
|
|
3160
|
-
|
3161
|
-
|
3268
|
+
#pragma unroll
|
3269
|
+
for (int j = 0; j < mmq_x; j += nwarps) {
|
3270
|
+
const int col_dst = col_dst_0 + j + threadIdx.y;
|
3162
3271
|
|
3163
3272
|
if (col_dst >= ncols_dst) {
|
3164
3273
|
return;
|
3165
3274
|
}
|
3166
3275
|
|
3167
|
-
|
3168
|
-
|
3276
|
+
#pragma unroll
|
3277
|
+
for (int i = 0; i < mmq_y; i += WARP_SIZE) {
|
3278
|
+
const int row_dst = row_dst_0 + threadIdx.x + i;
|
3279
|
+
|
3280
|
+
if (row_dst >= nrows_dst) {
|
3281
|
+
continue;
|
3282
|
+
}
|
3283
|
+
|
3284
|
+
dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
|
3169
3285
|
}
|
3170
3286
|
}
|
3171
3287
|
}
|
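mul_mat_q is now a __device__ helper parameterized on the tile shape: each block of WARP_SIZE x nwarps threads produces an mmq_y x mmq_x tile of dst, streaming x through the load_tiles_* shared tiles and y through tile_y_qs/tile_y_ds, and keeping (mmq_y/WARP_SIZE) * (mmq_x/nwarps) partial sums per thread in registers (when need_sum is false the q8_1 scale is converted to f32 up front, as the comment in the loop notes). A host-side sketch of the implied launch geometry (matrix sizes are made up):

    #include <cstdio>

    int main(void) {
        const int WARP_SIZE = 32;
        const int mmq_x = 64, mmq_y = 128, nwarps = 4;       // the Q4_0 Turing+ values below
        const int nrows_x = 4096, ncols_y = 512;             // example matrix sizes

        const int grid_x   = (nrows_x + mmq_y - 1) / mmq_y;  // row tiles of x / dst
        const int grid_y   = (ncols_y + mmq_x - 1) / mmq_x;  // column tiles of y / dst
        const int per_thrd = (mmq_y/WARP_SIZE) * (mmq_x/nwarps);

        printf("grid=(%d,%d,1) block=(%d,%d,1) partial sums per thread=%d\n",
               grid_x, grid_y, WARP_SIZE, nwarps, per_thrd);
        return 0;
    }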
3172
3288
|
|
3289
|
+
#define MMQ_X_Q4_0_AMPERE 64
|
3290
|
+
#define MMQ_Y_Q4_0_AMPERE 128
|
3291
|
+
#define NWARPS_Q4_0_AMPERE 4
|
3292
|
+
#define MMQ_X_Q4_0_PASCAL 64
|
3293
|
+
#define MMQ_Y_Q4_0_PASCAL 64
|
3294
|
+
#define NWARPS_Q4_0_PASCAL 8
|
3295
|
+
|
3296
|
+
template <bool need_check> static __global__ void mul_mat_q4_0(
|
3297
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3298
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3299
|
+
|
3300
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3301
|
+
const int mmq_x = MMQ_X_Q4_0_AMPERE;
|
3302
|
+
const int mmq_y = MMQ_Y_Q4_0_AMPERE;
|
3303
|
+
const int nwarps = NWARPS_Q4_0_AMPERE;
|
3304
|
+
|
3305
|
+
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3306
|
+
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3307
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3308
|
+
|
3309
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3310
|
+
const int mmq_x = MMQ_X_Q4_0_PASCAL;
|
3311
|
+
const int mmq_y = MMQ_Y_Q4_0_PASCAL;
|
3312
|
+
const int nwarps = NWARPS_Q4_0_PASCAL;
|
3313
|
+
|
3314
|
+
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3315
|
+
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3316
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3317
|
+
#else
|
3318
|
+
(void) vec_dot_q4_0_q8_1_mul_mat;
|
3319
|
+
assert(false);
|
3320
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3321
|
+
}
|
3322
|
+
|
3323
|
+
#define MMQ_X_Q4_1_AMPERE 64
|
3324
|
+
#define MMQ_Y_Q4_1_AMPERE 128
|
3325
|
+
#define NWARPS_Q4_1_AMPERE 4
|
3326
|
+
#define MMQ_X_Q4_1_PASCAL 64
|
3327
|
+
#define MMQ_Y_Q4_1_PASCAL 64
|
3328
|
+
#define NWARPS_Q4_1_PASCAL 8
|
3329
|
+
|
3330
|
+
template <bool need_check> static __global__ void
|
3331
|
+
#if __CUDA_ARCH__ < CC_TURING
|
3332
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
|
3333
|
+
#endif // __CUDA_ARCH__ < CC_TURING
|
3334
|
+
mul_mat_q4_1(
|
3335
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3336
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3337
|
+
|
3338
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3339
|
+
const int mmq_x = MMQ_X_Q4_1_AMPERE;
|
3340
|
+
const int mmq_y = MMQ_Y_Q4_1_AMPERE;
|
3341
|
+
const int nwarps = NWARPS_Q4_1_AMPERE;
|
3342
|
+
|
3343
|
+
mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
|
3344
|
+
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
|
3345
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3346
|
+
|
3347
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3348
|
+
const int mmq_x = MMQ_X_Q4_1_PASCAL;
|
3349
|
+
const int mmq_y = MMQ_Y_Q4_1_PASCAL;
|
3350
|
+
const int nwarps = NWARPS_Q4_1_PASCAL;
|
3351
|
+
|
3352
|
+
mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
|
3353
|
+
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
|
3354
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3355
|
+
#else
|
3356
|
+
(void) vec_dot_q4_1_q8_1_mul_mat;
|
3357
|
+
assert(false);
|
3358
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3359
|
+
}
|
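Each quant type now gets a thin __global__ wrapper that picks its MMQ_X/MMQ_Y/NWARPS constants from __CUDA_ARCH__ at compile time; on pre-Turing devices some wrappers additionally apply __launch_bounds__(WARP_SIZE*NWARPS_*, 2) to cap register use so that at least two blocks can reside per SM. A stripped-down, compile-only sketch of that pattern (constants and the kernel body are placeholders, not the real kernel):

    #define SKETCH_WARP_SIZE     32
    #define SKETCH_NWARPS_PASCAL  8

    template <bool need_check> static __global__ void
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
    __launch_bounds__(SKETCH_WARP_SIZE*SKETCH_NWARPS_PASCAL, 2)  // allow >= 2 blocks per SM
    #endif
    sketch_kernel(float * dst) {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
        const int mmq_x = 64;    // larger tiles where registers/shared memory allow
    #else
        const int mmq_x = 32;    // smaller tiles on older architectures
    #endif
        dst[blockIdx.x*blockDim.x + threadIdx.x] = (float) (mmq_x + (need_check ? 1 : 0));
    }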
3360
|
+
|
3361
|
+
#define MMQ_X_Q5_0_AMPERE 128
|
3362
|
+
#define MMQ_Y_Q5_0_AMPERE 64
|
3363
|
+
#define NWARPS_Q5_0_AMPERE 4
|
3364
|
+
#define MMQ_X_Q5_0_PASCAL 64
|
3365
|
+
#define MMQ_Y_Q5_0_PASCAL 64
|
3366
|
+
#define NWARPS_Q5_0_PASCAL 8
|
3367
|
+
|
3368
|
+
template <bool need_check> static __global__ void mul_mat_q5_0(
|
3369
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3370
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3371
|
+
|
3372
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3373
|
+
const int mmq_x = MMQ_X_Q5_0_AMPERE;
|
3374
|
+
const int mmq_y = MMQ_Y_Q5_0_AMPERE;
|
3375
|
+
const int nwarps = NWARPS_Q5_0_AMPERE;
|
3376
|
+
|
3377
|
+
mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
|
3378
|
+
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
3379
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3380
|
+
|
3381
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3382
|
+
const int mmq_x = MMQ_X_Q5_0_PASCAL;
|
3383
|
+
const int mmq_y = MMQ_Y_Q5_0_PASCAL;
|
3384
|
+
const int nwarps = NWARPS_Q5_0_PASCAL;
|
3385
|
+
|
3386
|
+
mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
|
3387
|
+
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
3388
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3389
|
+
#else
|
3390
|
+
(void) vec_dot_q5_0_q8_1_mul_mat;
|
3391
|
+
assert(false);
|
3392
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3393
|
+
}
|
3394
|
+
|
3395
|
+
#define MMQ_X_Q5_1_AMPERE 128
|
3396
|
+
#define MMQ_Y_Q5_1_AMPERE 64
|
3397
|
+
#define NWARPS_Q5_1_AMPERE 4
|
3398
|
+
#define MMQ_X_Q5_1_PASCAL 64
|
3399
|
+
#define MMQ_Y_Q5_1_PASCAL 64
|
3400
|
+
#define NWARPS_Q5_1_PASCAL 8
|
3401
|
+
|
3402
|
+
template <bool need_check> static __global__ void mul_mat_q5_1(
|
3403
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3404
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3405
|
+
|
3406
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3407
|
+
const int mmq_x = MMQ_X_Q5_1_AMPERE;
|
3408
|
+
const int mmq_y = MMQ_Y_Q5_1_AMPERE;
|
3409
|
+
const int nwarps = NWARPS_Q5_1_AMPERE;
|
3410
|
+
|
3411
|
+
mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
|
3412
|
+
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
3413
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3414
|
+
|
3415
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3416
|
+
const int mmq_x = MMQ_X_Q5_1_PASCAL;
|
3417
|
+
const int mmq_y = MMQ_Y_Q5_1_PASCAL;
|
3418
|
+
const int nwarps = NWARPS_Q5_1_PASCAL;
|
3419
|
+
|
3420
|
+
mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
|
3421
|
+
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
3422
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3423
|
+
#else
|
3424
|
+
(void) vec_dot_q5_1_q8_1_mul_mat;
|
3425
|
+
assert(false);
|
3426
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3427
|
+
}
|
3428
|
+
|
3429
|
+
#define MMQ_X_Q8_0_AMPERE 128
|
3430
|
+
#define MMQ_Y_Q8_0_AMPERE 64
|
3431
|
+
#define NWARPS_Q8_0_AMPERE 4
|
3432
|
+
#define MMQ_X_Q8_0_PASCAL 64
|
3433
|
+
#define MMQ_Y_Q8_0_PASCAL 64
|
3434
|
+
#define NWARPS_Q8_0_PASCAL 8
|
3435
|
+
|
3436
|
+
template <bool need_check> static __global__ void mul_mat_q8_0(
|
3437
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3438
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3439
|
+
|
3440
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3441
|
+
const int mmq_x = MMQ_X_Q8_0_AMPERE;
|
3442
|
+
const int mmq_y = MMQ_Y_Q8_0_AMPERE;
|
3443
|
+
const int nwarps = NWARPS_Q8_0_AMPERE;
|
3444
|
+
|
3445
|
+
mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
|
3446
|
+
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
3447
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3448
|
+
|
3449
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3450
|
+
const int mmq_x = MMQ_X_Q8_0_PASCAL;
|
3451
|
+
const int mmq_y = MMQ_Y_Q8_0_PASCAL;
|
3452
|
+
const int nwarps = NWARPS_Q8_0_PASCAL;
|
3453
|
+
|
3454
|
+
mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
|
3455
|
+
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
3456
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3457
|
+
#else
|
3458
|
+
(void) vec_dot_q8_0_q8_1_mul_mat;
|
3459
|
+
assert(false);
|
3460
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3461
|
+
}
|
3462
|
+
|
3463
|
+
#define MMQ_X_Q2_K_AMPERE 64
|
3464
|
+
#define MMQ_Y_Q2_K_AMPERE 128
|
3465
|
+
#define NWARPS_Q2_K_AMPERE 4
|
3466
|
+
#define MMQ_X_Q2_K_PASCAL 64
|
3467
|
+
#define MMQ_Y_Q2_K_PASCAL 64
|
3468
|
+
#define NWARPS_Q2_K_PASCAL 8
|
3469
|
+
|
3470
|
+
template <bool need_check> static __global__ void mul_mat_q2_K(
|
3471
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3472
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3473
|
+
|
3474
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3475
|
+
const int mmq_x = MMQ_X_Q2_K_AMPERE;
|
3476
|
+
const int mmq_y = MMQ_Y_Q2_K_AMPERE;
|
3477
|
+
const int nwarps = NWARPS_Q2_K_AMPERE;
|
3478
|
+
|
3479
|
+
mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
|
3480
|
+
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
|
3481
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3482
|
+
|
3483
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3484
|
+
const int mmq_x = MMQ_X_Q2_K_PASCAL;
|
3485
|
+
const int mmq_y = MMQ_Y_Q2_K_PASCAL;
|
3486
|
+
const int nwarps = NWARPS_Q2_K_PASCAL;
|
3487
|
+
|
3488
|
+
mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
|
3489
|
+
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
|
3490
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3491
|
+
#else
|
3492
|
+
(void) vec_dot_q2_K_q8_1_mul_mat;
|
3493
|
+
assert(false);
|
3494
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3495
|
+
}
|
3496
|
+
|
3497
|
+
#define MMQ_X_Q3_K_AMPERE 128
|
3498
|
+
#define MMQ_Y_Q3_K_AMPERE 128
|
3499
|
+
#define NWARPS_Q3_K_AMPERE 4
|
3500
|
+
#define MMQ_X_Q3_K_PASCAL 64
|
3501
|
+
#define MMQ_Y_Q3_K_PASCAL 64
|
3502
|
+
#define NWARPS_Q3_K_PASCAL 8
|
3503
|
+
|
3504
|
+
template <bool need_check> static __global__ void
|
3505
|
+
#if __CUDA_ARCH__ < CC_TURING
|
3506
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
|
3507
|
+
#endif // __CUDA_ARCH__ < CC_TURING
|
3508
|
+
mul_mat_q3_K(
|
3509
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3510
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3511
|
+
|
3512
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3513
|
+
const int mmq_x = MMQ_X_Q3_K_AMPERE;
|
3514
|
+
const int mmq_y = MMQ_Y_Q3_K_AMPERE;
|
3515
|
+
const int nwarps = NWARPS_Q3_K_AMPERE;
|
3516
|
+
|
3517
|
+
mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
|
3518
|
+
load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
|
3519
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3520
|
+
|
3521
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3522
|
+
const int mmq_x = MMQ_X_Q3_K_PASCAL;
|
3523
|
+
const int mmq_y = MMQ_Y_Q3_K_PASCAL;
|
3524
|
+
const int nwarps = NWARPS_Q3_K_PASCAL;
|
3525
|
+
|
3526
|
+
mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
|
3527
|
+
load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
|
3528
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3529
|
+
#else
|
3530
|
+
(void) vec_dot_q3_K_q8_1_mul_mat;
|
3531
|
+
assert(false);
|
3532
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3533
|
+
}
|
3534
|
+
|
3535
|
+
#define MMQ_X_Q4_K_AMPERE 64
|
3536
|
+
#define MMQ_Y_Q4_K_AMPERE 128
|
3537
|
+
#define NWARPS_Q4_K_AMPERE 4
|
3538
|
+
#define MMQ_X_Q4_K_PASCAL 64
|
3539
|
+
#define MMQ_Y_Q4_K_PASCAL 64
|
3540
|
+
#define NWARPS_Q4_K_PASCAL 8
|
3541
|
+
|
3542
|
+
template <bool need_check> static __global__ void
|
3543
|
+
#if __CUDA_ARCH__ < CC_TURING
|
3544
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
|
3545
|
+
#endif // __CUDA_ARCH__ < CC_TURING
|
3546
|
+
mul_mat_q4_K(
|
3547
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3548
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3549
|
+
|
3550
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3551
|
+
const int mmq_x = MMQ_X_Q4_K_AMPERE;
|
3552
|
+
const int mmq_y = MMQ_Y_Q4_K_AMPERE;
|
3553
|
+
const int nwarps = NWARPS_Q4_K_AMPERE;
|
3554
|
+
|
3555
|
+
mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
|
3556
|
+
load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
|
3557
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3558
|
+
|
3559
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3560
|
+
const int mmq_x = MMQ_X_Q4_K_PASCAL;
|
3561
|
+
const int mmq_y = MMQ_Y_Q4_K_PASCAL;
|
3562
|
+
const int nwarps = NWARPS_Q4_K_PASCAL;
|
3563
|
+
|
3564
|
+
mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
|
3565
|
+
load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
|
3566
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3567
|
+
#else
|
3568
|
+
(void) vec_dot_q4_K_q8_1_mul_mat;
|
3569
|
+
assert(false);
|
3570
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3571
|
+
}
|
3572
|
+
|
3573
|
+
#define MMQ_X_Q5_K_AMPERE 64
|
3574
|
+
#define MMQ_Y_Q5_K_AMPERE 128
|
3575
|
+
#define NWARPS_Q5_K_AMPERE 4
|
3576
|
+
#define MMQ_X_Q5_K_PASCAL 64
|
3577
|
+
#define MMQ_Y_Q5_K_PASCAL 64
|
3578
|
+
#define NWARPS_Q5_K_PASCAL 8
|
3579
|
+
|
3580
|
+
template <bool need_check> static __global__ void mul_mat_q5_K(
|
3581
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3582
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3583
|
+
|
3584
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3585
|
+
const int mmq_x = MMQ_X_Q5_K_AMPERE;
|
3586
|
+
const int mmq_y = MMQ_Y_Q5_K_AMPERE;
|
3587
|
+
const int nwarps = NWARPS_Q5_K_AMPERE;
|
3588
|
+
|
3589
|
+
mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
|
3590
|
+
load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
|
3591
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3592
|
+
|
3593
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3594
|
+
const int mmq_x = MMQ_X_Q5_K_PASCAL;
|
3595
|
+
const int mmq_y = MMQ_Y_Q5_K_PASCAL;
|
3596
|
+
const int nwarps = NWARPS_Q5_K_PASCAL;
|
3597
|
+
|
3598
|
+
mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
|
3599
|
+
load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
|
3600
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3601
|
+
#else
|
3602
|
+
(void) vec_dot_q5_K_q8_1_mul_mat;
|
3603
|
+
assert(false);
|
3604
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3605
|
+
}
|
3606
|
+
|
3607
|
+
#define MMQ_X_Q6_K_AMPERE 64
|
3608
|
+
#define MMQ_Y_Q6_K_AMPERE 64
|
3609
|
+
#define NWARPS_Q6_K_AMPERE 4
|
3610
|
+
#define MMQ_X_Q6_K_PASCAL 64
|
3611
|
+
#define MMQ_Y_Q6_K_PASCAL 64
|
3612
|
+
#define NWARPS_Q6_K_PASCAL 8
|
3613
|
+
|
3614
|
+
template <bool need_check> static __global__ void
|
3615
|
+
#if __CUDA_ARCH__ < CC_TURING
|
3616
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
|
3617
|
+
#endif // __CUDA_ARCH__ < CC_TURING
|
3618
|
+
mul_mat_q6_K(
|
3619
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3620
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3621
|
+
|
3622
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3623
|
+
const int mmq_x = MMQ_X_Q6_K_AMPERE;
|
3624
|
+
const int mmq_y = MMQ_Y_Q6_K_AMPERE;
|
3625
|
+
const int nwarps = NWARPS_Q6_K_AMPERE;
|
3626
|
+
|
3627
|
+
mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
|
3628
|
+
load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
|
3629
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3630
|
+
|
3631
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3632
|
+
const int mmq_x = MMQ_X_Q6_K_PASCAL;
|
3633
|
+
const int mmq_y = MMQ_Y_Q6_K_PASCAL;
|
3634
|
+
const int nwarps = NWARPS_Q6_K_PASCAL;
|
3635
|
+
|
3636
|
+
mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
|
3637
|
+
load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
|
3638
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3639
|
+
#else
|
3640
|
+
(void) vec_dot_q6_K_q8_1_mul_mat;
|
3641
|
+
assert(false);
|
3642
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3643
|
+
}
|
3644
|
+
|
3173
3645
|
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
3174
3646
|
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
|
3175
3647
|
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
@@ -3780,7 +4252,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
3780
4252
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
3781
4253
|
const dim3 block_nums(1, block_num_y, 1);
|
3782
4254
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
3783
|
-
mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
|
4255
|
+
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
|
3784
4256
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
3785
4257
|
}
|
3786
4258
|
|
@@ -3789,7 +4261,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
3789
4261
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
3790
4262
|
const dim3 block_nums(1, block_num_y, 1);
|
3791
4263
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
3792
|
-
mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
|
4264
|
+
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
|
3793
4265
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
3794
4266
|
}
|
3795
4267
|
|
@@ -3798,7 +4270,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
3798
4270
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
3799
4271
|
const dim3 block_nums(1, block_num_y, 1);
|
3800
4272
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
3801
|
-
mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
|
4273
|
+
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
|
3802
4274
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
3803
4275
|
}
|
3804
4276
|
|
@@ -3807,7 +4279,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
3807
4279
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
3808
4280
|
const dim3 block_nums(1, block_num_y, 1);
|
3809
4281
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
3810
|
-
mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
|
4282
|
+
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
|
3811
4283
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
3812
4284
|
}
|
3813
4285
|
|
@@ -3816,7 +4288,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
3816
4288
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
3817
4289
|
const dim3 block_nums(1, block_num_y, 1);
|
3818
4290
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
3819
|
-
mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
|
4291
|
+
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
|
3820
4292
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
3821
4293
|
}
|
3822
4294
|
|
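The mul_mat_vec_q launches above now pass explicit VDR_*_MMVQ constants instead of hard-coded values; vdr is the number of packed 4-byte quant ints each call into the vec_dot impl consumes per iteration (the same template parameter seen earlier in this file). A host-side emulation of the general shape of such a loop, with __dp4a replaced by a byte-wise reference (all names here are illustrative):

    #include <cstdint>
    #include <cstdio>

    // Byte-wise reference for __dp4a(a, b, acc) on signed bytes.
    static int dp4a_emul(int a, int b, int acc) {
        for (int i = 0; i < 4; ++i) {
            acc += (int8_t)(a >> 8*i) * (int8_t)(b >> 8*i);
        }
        return acc;
    }

    // vdr controls how many packed ints one impl call processes.
    template <int vdr>
    static int vec_dot_sketch(const int * v, const int * u) {
        int sumi = 0;
        for (int i = 0; i < vdr; ++i) {
            sumi = dp4a_emul(v[i], u[i], sumi);
        }
        return sumi;
    }

    int main(void) {
        const int v[2] = {0x01020304, 0x05060708};
        const int u[2] = {0x01010101, 0x02020202};
        printf("%d %d\n", vec_dot_sketch<1>(v, u), vec_dot_sketch<2>(v, u));
        return 0;
    }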
@@ -3867,17 +4339,36 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q4_0_AMPERE;
+        mmq_y = MMQ_Y_Q4_0_AMPERE;
+        nwarps = NWARPS_Q4_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q4_0_PASCAL;
+        mmq_y = MMQ_Y_Q4_0_PASCAL;
+        nwarps = NWARPS_Q4_0_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        const bool need_check = false;
-        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-        const bool need_check = true;
-        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const bool need_check = true;
+        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
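Each of the mul-mat-q launchers in this series picks a tile shape (mmq_x by mmq_y) and a warp count per block from the device's compute capability, then ceil-divides the matrix dimensions by the tile to get the grid. A small host-side sketch of that selection logic follows; the tile constants in it are placeholders, not the MMQ_*_AMPERE / MMQ_*_PASCAL values defined in ggml-cuda.cu.

    // Host-side sketch of the per-architecture launch-geometry selection.
    #include <cassert>
    #include <cstdio>

    static const int WARP_SIZE   = 32;
    static const int MIN_CC_DP4A = 610;  // __dp4a available from this capability
    static const int CC_TURING   = 700;

    struct LaunchGeom { int grid_x, grid_y, threads_x, threads_y; bool need_check; };

    static LaunchGeom pick_geom(int compute_capability, int nrows_x, int ncols_y) {
        int mmq_x, mmq_y, nwarps;
        if (compute_capability >= CC_TURING) {
            mmq_x = 64; mmq_y = 128; nwarps = 4;   // placeholder "Turing+" tile
        } else if (compute_capability >= MIN_CC_DP4A) {
            mmq_x = 64; mmq_y = 64;  nwarps = 8;   // placeholder "Pascal" tile
        } else {
            assert(false && "mul_mat_q needs dp4a");
            return LaunchGeom{};
        }
        LaunchGeom g;
        g.grid_x     = (nrows_x + mmq_y - 1) / mmq_y;  // tiles along the rows of x
        g.grid_y     = (ncols_y + mmq_x - 1) / mmq_x;  // tiles along the columns of y
        g.threads_x  = WARP_SIZE;
        g.threads_y  = nwarps;
        g.need_check = (nrows_x % mmq_y) != 0;         // partial last tile needs bounds checks
        return g;
    }

    int main() {
        LaunchGeom g = pick_geom(750, 4096, 512);
        printf("grid %dx%d, block %dx%d, need_check=%d\n",
               g.grid_x, g.grid_y, g.threads_x, g.threads_y, (int)g.need_check);
        return 0;
    }

The need_check flag mirrors the template parameter used above: the bounds-checked kernel variant is only instantiated when the row count is not a multiple of the tile height.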
@@ -3885,17 +4376,36 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q4_1_AMPERE;
+        mmq_y = MMQ_Y_Q4_1_AMPERE;
+        nwarps = NWARPS_Q4_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q4_1_PASCAL;
+        mmq_y = MMQ_Y_Q4_1_PASCAL;
+        nwarps = NWARPS_Q4_1_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        const bool need_check = false;
-        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-        const bool need_check = true;
-        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const bool need_check = true;
+        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -3903,17 +4413,36 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q5_0_AMPERE;
+        mmq_y = MMQ_Y_Q5_0_AMPERE;
+        nwarps = NWARPS_Q5_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q5_0_PASCAL;
+        mmq_y = MMQ_Y_Q5_0_PASCAL;
+        nwarps = NWARPS_Q5_0_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        const bool need_check = false;
-        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-        const bool need_check = true;
-        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const bool need_check = true;
+        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
 }
 
@@ -3921,17 +4450,36 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q5_1_AMPERE;
+        mmq_y = MMQ_Y_Q5_1_AMPERE;
+        nwarps = NWARPS_Q5_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q5_1_PASCAL;
+        mmq_y = MMQ_Y_Q5_1_PASCAL;
+        nwarps = NWARPS_Q5_1_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        const bool need_check = false;
-        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-        const bool need_check = true;
-        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const bool need_check = true;
+        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -3939,17 +4487,36 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q8_0_AMPERE;
+        mmq_y = MMQ_Y_Q8_0_AMPERE;
+        nwarps = NWARPS_Q8_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q8_0_PASCAL;
+        mmq_y = MMQ_Y_Q8_0_PASCAL;
+        nwarps = NWARPS_Q8_0_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        const bool need_check = false;
-        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-        const bool need_check = true;
-        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const bool need_check = true;
+        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -3957,17 +4524,36 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q2_K_AMPERE;
+        mmq_y = MMQ_Y_Q2_K_AMPERE;
+        nwarps = NWARPS_Q2_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q2_K_PASCAL;
+        mmq_y = MMQ_Y_Q2_K_PASCAL;
+        nwarps = NWARPS_Q2_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        const bool need_check = false;
-        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-        const bool need_check = true;
-        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const bool need_check = true;
+        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -3975,17 +4561,36 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q3_K_AMPERE;
+        mmq_y = MMQ_Y_Q3_K_AMPERE;
+        nwarps = NWARPS_Q3_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q3_K_PASCAL;
+        mmq_y = MMQ_Y_Q3_K_PASCAL;
+        nwarps = NWARPS_Q3_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        const bool need_check = false;
-        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-        const bool need_check = true;
-        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const bool need_check = true;
+        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -3993,17 +4598,36 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q4_K_AMPERE;
+        mmq_y = MMQ_Y_Q4_K_AMPERE;
+        nwarps = NWARPS_Q4_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q4_K_PASCAL;
+        mmq_y = MMQ_Y_Q4_K_PASCAL;
+        nwarps = NWARPS_Q4_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        const bool need_check = false;
-        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-        const bool need_check = true;
-        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const bool need_check = true;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4011,17 +4635,36 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q5_K_AMPERE;
+        mmq_y = MMQ_Y_Q5_K_AMPERE;
+        nwarps = NWARPS_Q5_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q5_K_PASCAL;
+        mmq_y = MMQ_Y_Q5_K_PASCAL;
+        nwarps = NWARPS_Q5_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        const bool need_check = false;
-        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-        const bool need_check = true;
-        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const bool need_check = true;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4029,17 +4672,36 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-    const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y;
-    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q6_K_AMPERE;
+        mmq_y = MMQ_Y_Q6_K_AMPERE;
+        nwarps = NWARPS_Q6_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q6_K_PASCAL;
+        mmq_y = MMQ_Y_Q6_K_PASCAL;
+        nwarps = NWARPS_Q6_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x % GGML_CUDA_MMQ_Y == 0) {
-        const bool need_check = false;
-        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-        const bool need_check = true;
-        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        const bool need_check = true;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
    }
 }
 
@@ -4214,20 +4876,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
 }
 
 
-static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
-static size_t g_scratch_offset = 0;
-
-static int g_device_count = -1;
-static int g_main_device = 0;
-static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
-static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q = false;
-
-static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
-
-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
 void ggml_init_cublas() {
     static bool initialized = false;
 
@@ -4583,6 +5231,37 @@ inline void ggml_cuda_op_mul_mat_q(
     (void) i1;
 }
 
+static int64_t get_row_rounding(ggml_type type) {
+    int max_compute_capability = INT_MIN;
+    for (int id = 0; id < g_device_count; ++id) {
+        if (max_compute_capability < g_compute_capabilities[id]
+                && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            max_compute_capability = g_compute_capabilities[id];
+        }
+    }
+
+    switch(type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+            return max_compute_capability >= CC_TURING ? 128 : 64;
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return 64;
+        case GGML_TYPE_F16:
+            return 1;
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+            return max_compute_capability >= CC_TURING ? 128 : 64;
+        case GGML_TYPE_Q6_K:
+            return 64;
+        default:
+            GGML_ASSERT(false);
+    }
+}
+
 inline void ggml_cuda_op_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
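get_row_rounding returns the granularity, in rows, at which a weight matrix may be split across GPUs; the per-device slice boundaries are then rounded down to a multiple of it so every slice stays aligned to the mul-mat-q tile height. A small sketch of that boundary arithmetic, assuming a rounding of 128 rows and an illustrative split table:

    // Sketch of how row_low / row_high are derived from cumulative split
    // fractions and rounded to the per-type granularity. Values are illustrative.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t nrows    = 4096;
        const int64_t rounding = 128;                   // e.g. a Q4_0 tensor on Turing or newer
        const float   split[3] = {0.0f, 0.30f, 0.75f};  // cumulative per-device start fractions
        const int     device_count = 3;

        for (int id = 0; id < device_count; ++id) {
            int64_t row_low  = id == 0 ? 0 : (int64_t)(nrows*split[id]);
            row_low         -= row_low % rounding;      // align slice start
            int64_t row_high = id == device_count - 1 ? nrows : (int64_t)(nrows*split[id + 1]);
            if (id != device_count - 1) {
                row_high -= row_high % rounding;        // align slice end
            }
            printf("device %d: rows [%lld, %lld)\n", id, (long long)row_low, (long long)row_high);
        }
        return 0;
    }

With these numbers the 4096 rows land on the boundaries 0, 1152 and 3072, all multiples of 128, so no device receives a partially filled tile.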
@@ -4983,14 +5662,16 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 
     int64_t row_low, row_high;
     if (split) {
+        const int64_t rounding = get_row_rounding(src0->type);
+
         row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
-        row_low -= row_low % GGML_CUDA_MMQ_Y;
+        row_low -= row_low % rounding;
 
         if (id == g_device_count - 1) {
             row_high = nrows0;
         } else {
             row_high = nrows0*g_tensor_split[id + 1];
-            row_high -= row_high % GGML_CUDA_MMQ_Y;
+            row_high -= row_high % rounding;
         }
     } else {
         row_low = 0;
@@ -5203,7 +5884,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     if (split && g_device_count > 1) {
         CUDA_CHECK(cudaSetDevice(g_main_device));
         for (int id = 0; id < g_device_count; ++id) {
-            if (id != g_main_device) {
+            if (id != g_main_device && src0_extra->events[id]) {
                 CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
             }
         }
@@ -5347,7 +6028,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     } else {
         int min_compute_capability = INT_MAX;
         for (int id = 0; id < g_device_count; ++id) {
-            if (min_compute_capability > g_compute_capabilities[id]) {
+            if (min_compute_capability > g_compute_capabilities[id]
+                && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
                 min_compute_capability = g_compute_capabilities[id];
             }
         }
@@ -5468,14 +6150,16 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         row_low = 0;
         row_high = nrows;
     } else if (backend == GGML_BACKEND_GPU_SPLIT) {
+        const int64_t rounding = get_row_rounding(tensor->type);
+
         row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
-        row_low -= row_low % GGML_CUDA_MMQ_Y;
+        row_low -= row_low % rounding;
 
         if (id == g_device_count - 1) {
             row_high = nrows;
         } else {
             row_high = nrows*g_tensor_split[id + 1];
-            row_high -= row_high % GGML_CUDA_MMQ_Y;
+            row_high -= row_high % rounding;
         }
     } else {
         GGML_ASSERT(false);
@@ -5785,3 +6469,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     func(tensor->src[0], tensor->src[1], tensor);
     return true;
 }
+
+int ggml_cuda_get_device_count() {
+    int device_count;
+    CUDA_CHECK(cudaGetDeviceCount(&device_count));
+    return device_count;
+}
+
+void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+    snprintf(description, description_size, "%s", prop.name);
+}
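The two functions appended at the end of the file are thin wrappers over cudaGetDeviceCount and cudaGetDeviceProperties. A minimal caller might look like the sketch below; the extern "C" declarations are an assumption standing in for whatever the public header exposes.

    // Minimal calling sketch for the two helpers defined above.
    #include <cstddef>
    #include <cstdio>

    extern "C" {
    int  ggml_cuda_get_device_count();
    void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
    }

    int main() {
        const int n = ggml_cuda_get_device_count();
        for (int i = 0; i < n; ++i) {
            char name[128];
            ggml_cuda_get_device_description(i, name, sizeof(name));
            printf("CUDA device %d: %s\n", i, name);
        }
        return 0;
    }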