llama_cpp 0.3.6 → 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +8 -0
- data/ext/llama_cpp/src/ggml-alloc.c +44 -6
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1398 -702
- data/ext/llama_cpp/src/ggml-cuda.h +19 -23
- data/ext/llama_cpp/src/ggml-metal.h +6 -3
- data/ext/llama_cpp/src/ggml-metal.m +112 -146
- data/ext/llama_cpp/src/ggml-metal.metal +471 -498
- data/ext/llama_cpp/src/ggml.c +396 -150
- data/ext/llama_cpp/src/ggml.h +113 -32
- data/ext/llama_cpp/src/llama-util.h +51 -9
- data/ext/llama_cpp/src/llama.cpp +390 -210
- data/ext/llama_cpp/src/llama.h +20 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +1 -0
- metadata +2 -2
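
Most of the changes below are in `data/ext/llama_cpp/src/ggml-cuda.cu`: the quantized dot-product helpers are reworked around the `__dp4a` byte-wise integer intrinsic (with an `assert(false)` fallback below `MIN_CC_DP4A`), and the mul-mat-q tile kernels become templated on `mmq_y`/`nwarps` instead of the removed compile-time `GGML_CUDA_MMQ_Y` constant. As a minimal, standalone sketch of the `__dp4a` accumulation pattern that recurs throughout the diff (illustrative names and block size only, not the library's actual API):

```cuda
// Hedged sketch: dot product of two int8-quantized blocks with per-block float
// scales, accumulated with __dp4a (compute capability >= 6.1). Not from the package;
// BLOCK_INTS, block_dot_q8, and block_dot_kernel are illustrative names.
#include <cassert>

#define BLOCK_INTS 8 // 32 int8 values packed as 8 ints per block (illustrative)

static __device__ __forceinline__ float block_dot_q8(
    const int * v, const int * u, const float d_v, const float d_u) {
#if __CUDA_ARCH__ >= 610 // lowest compute capability for __dp4a
    int sumi = 0;
#pragma unroll
    for (int i = 0; i < BLOCK_INTS; ++i) {
        sumi = __dp4a(v[i], u[i], sumi); // byte-wise dot product of 4 int8 pairs
    }
    return d_v * d_u * sumi; // rescale the integer sum by both block scales
#else
    assert(false);
    return 0.0f; // only to satisfy the compiler
#endif
}

__global__ void block_dot_kernel(const int * v, const int * u,
                                 const float d_v, const float d_u, float * out) {
    if (blockIdx.x == 0 && threadIdx.x == 0) {
        *out = block_dot_q8(v, u, d_v, d_u);
    }
}
```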
@@ -14,6 +14,7 @@
 #include "ggml.h"

 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define CC_TURING 700

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -262,10 +263,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256

-#ifndef GGML_CUDA_MMQ_Y
-#define GGML_CUDA_MMQ_Y 64
-#endif // GGML_CUDA_MMQ_Y
-
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
@@ -285,6 +282,20 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
 };

+static int g_device_count = -1;
+static int g_main_device = 0;
+static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+static bool g_mul_mat_q = false;
+
+static void * g_scratch_buffer = nullptr;
+static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
+static size_t g_scratch_offset = 0;
+
+static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -1383,9 +1394,12 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
         sumi = __dp4a(vi1, u[2*i+1], sumi);
     }

+    const float2 ds8f = __half22float2(ds8);
+
     // second part effectively subtracts 8 from each quant value
-    return d4 * (sumi *
+    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1410,17 +1424,20 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     }

 #ifdef GGML_CUDA_F16
-    const
-    const float d4d8 =
-    const float m4s8 =
+    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
+    const float d4d8 = tmp.x;
+    const float m4s8 = tmp.y;
 #else
-    const
-    const
+    const float2 dm4f = __half22float2(dm4);
+    const float2 ds8f = __half22float2(ds8);
+    const float d4d8 = dm4f.x * ds8f.x;
+    const float m4s8 = dm4f.y * ds8f.y;
 #endif // GGML_CUDA_F16

     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1434,6 +1451,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;

+#pragma unroll
     for (int i = 0; i < vdr; ++i) {
         int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
         vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
@@ -1450,9 +1468,12 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
         sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
     }

+    const float2 ds8f = __half22float2(ds8);
+
     // second part effectively subtracts 16 from each quant value
-    return d5 * (sumi*
+    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1466,6 +1487,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;

+#pragma unroll
     for (int i = 0; i < vdr; ++i) {
         int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
         vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
@@ -1483,18 +1505,21 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     }

 #ifdef GGML_CUDA_F16
-    const
-    const float d5d8 =
-    const float m5s8 =
+    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
+    const float d5d8 = tmp.x;
+    const float m5s8 = tmp.y;
 #else
-    const
-    const
+    const float2 dm5f = __half22float2(dm5);
+    const float2 ds8f = __half22float2(ds8);
+    const float d5d8 = dm5f.x * ds8f.x;
+    const float m5s8 = dm5f.y * ds8f.y;
 #endif // GGML_CUDA_F16

     // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1503,18 +1528,20 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
 #define VDR_Q8_0_Q8_1_MMQ 8

 template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
-    const int * v, const int * u, const float & d8_0, const
+    const int * v, const int * u, const float & d8_0, const float & d8_1) {

 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;

+#pragma unroll
     for (int i = 0; i < vdr; ++i) {
         // SIMD dot product of quantized values
         sumi = __dp4a(v[i], u[i], sumi);
     }

-    return
+    return d8_0*d8_1 * sumi;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1525,23 +1552,374 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;

+#pragma unroll
     for (int i = 0; i < vdr; ++i) {
         // SIMD dot product of quantized values
         sumi = __dp4a(v[i], u[i], sumi);
     }

 #ifdef GGML_CUDA_F16
-    const
-    const float d8d8 =
-    const float m8s8 =
+    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
+    const float d8d8 = tmp.x;
+    const float m8s8 = tmp.y;
 #else
-    const
-    const
+    const float2 dm8f = __half22float2(dm8);
+    const float2 ds8f = __half22float2(ds8);
+    const float d8d8 = dm8f.x * ds8f.x;
+    const float m8s8 = dm8f.y * ds8f.y;
 #endif // GGML_CUDA_F16

     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q2_K_Q8_1_MMVQ 1
+#define VDR_Q2_K_Q8_1_MMQ 2
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
+    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
+    const half2 & dm2, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR2_K; ++i) {
+        const int sc = scales[2*i];
+
+        const int vi = (v >> (2*i)) & 0x03030303;
+
+        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
+
+        // fill int with 4x m
+        int m = sc >> 4;
+        m |= m << 8;
+        m |= m << 16;
+        sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
+    }
+
+    const float2 dm2f = __half22float2(dm2);
+
+    return dm2f.x*sumf_d - dm2f.y*sumf_m;
+#else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
+    const half2 & dm2, const float & d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi_d = 0;
+    int sumi_m = 0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
+        int sumi_d_sc = 0;
+
+        const int sc = scales[i0 / (QI8_1/2)];
+
+        // fill int with 4x m
+        int m = sc >> 4;
+        m |= m << 8;
+        m |= m << 16;
+
+#pragma unroll
+        for (int i = i0; i < i0 + QI8_1/2; ++i) {
+            sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
+            sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m
+        }
+
+        sumi_d += sumi_d_sc * (sc & 0xF);
+    }
+
+    const float2 dm2f = __half22float2(dm2);
+
+    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
+#else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q3_K_Q8_1_MMVQ 1
+#define VDR_Q3_K_Q8_1_MMQ 2
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
+    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
+    const int & scale_offset, const float & d3, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR3_K; ++i) {
+        const int isc = scale_offset + 2*i;
+
+        const int isc_low = isc % (QK_K/32);
+        const int sc_shift_low = 4 * (isc / (QK_K/32));
+        const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
+
+        const int isc_high = isc % (QK_K/64);
+        const int sc_shift_high = 2 * (isc / (QK_K/64));
+        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+        const int sc = (sc_low | sc_high) - 32;
+
+        const int vil = (vl >> (2*i)) & 0x03030303;
+
+        const int vih = ((vh >> i) << 2) & 0x04040404;
+
+        const int vi = __vsubss4(vil, vih);
+
+        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+    }
+
+    return d3 * sumf;
+#else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
+    const float & d3, const float & d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
+        int sumi_sc = 0;
+
+        for (int i = i0; i < i0 + QI8_1/2; ++i) {
+            sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
+        }
+
+        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
+    }
+
+    return d3*d8 * sumi;
+#else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q4_K_Q8_1_MMVQ 2
+#define VDR_Q4_K_Q8_1_MMQ 8
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR4_K; ++i) {
+        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
+        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
+
+        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
+
+        sumf_d += d8[i] * (dot1 * sc[i]);
+        sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i]);
+
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_K_Q8_1_MMVQ 2
+#define VDR_Q5_K_Q8_1_MMQ 8
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
+    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K; ++i) {
+        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
+        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
+
+        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
+        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
+
+        const int v0i = vl0i | vh0i;
+        const int v1i = vl1i | vh1i;
+
+        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
+
+        sumf_d += d8[i] * (dot1 * sc[i]);
+        sumf_m += d8[i] * (dot2 * m[i]);
+
+    }
+
+    const float2 dm5f = __half22float2(dm5);
+
+    return dm5f.x*sumf_d - dm5f.y*sumf_m;
+
+#else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i]);
+
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q6_K_Q8_1_MMVQ 1
+#define VDR_Q6_K_Q8_1_MMQ 8
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
+    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
+    const float & d, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR6_K; ++i) {
+        const int sc = scales[4*i];
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
+
+        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
+    const float & d6, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+
+#pragma unroll
+    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
+        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
+
+#pragma unroll
+        for (int i = i0; i < i0 + 2; ++i) {
+            sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
+            sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
+
+            sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
+            sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
+        }
+
+        sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
+    }
+
+    return d6 * sumf_d;
+
+#else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1564,21 +1942,21 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
     return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
 }

-static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int tile_x_qs[
-    __shared__ float tile_x_d[
+    __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
+    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];

     *x_ql = tile_x_qs;
     *x_dm = (half2 *) tile_x_d;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset <
+    __builtin_assume(i_offset < nwarps);
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);

@@ -1590,7 +1968,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
     float * x_dmf = (float *) x_dm;

 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
@@ -1600,38 +1978,30 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
         const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;

         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
+        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
     }

-
-
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
+    const int kbxd = k % blocks_per_tile_x_row;

-
-
-
-    // const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
+        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;

-
-
-
+        if (need_check) {
+            i = min(i, i_max);
+        }

-
+        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;

-
-
+        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
+    }
 }

 static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const float * x_dmf = (float *) x_dm;

@@ -1639,13 +2009,13 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(

 #pragma unroll
     for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j *
-        u[2*l+1] = y_qs[j *
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
     }

     return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
         (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
-        y_ds[j * (
+        y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
 }

 static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
@@ -1666,21 +2036,21 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
     return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
 }

-static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int tile_x_qs[
-    __shared__ half2 tile_x_dm[
+    __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];

     *x_ql = tile_x_qs;
     *x_dm = tile_x_dm;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset <
+    __builtin_assume(i_offset < nwarps);
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);

@@ -1690,7 +2060,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
     const block_q4_1 * bx0 = (block_q4_1 *) vx;

 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
@@ -1706,7 +2076,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
     const int kbxd = k % blocks_per_tile_x_row;

 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
         int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;

         if (need_check) {
@@ -1723,26 +2093,19 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));

     int u[2*VDR_Q4_1_Q8_1_MMQ];

 #pragma unroll
     for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j *
-        u[2*l+1] = y_qs[j *
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
     }

     return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
         (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
-        y_ds[j * (
+        y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
 }

 static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
@@ -1765,21 +2128,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
     return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
 }

-static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int tile_x_ql[
-    __shared__ float tile_x_d[
+    __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];

     *x_ql = tile_x_ql;
     *x_dm = (half2 *) tile_x_d;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset <
+    __builtin_assume(i_offset < nwarps);
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);

@@ -1789,7 +2152,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
     const block_q5_0 * bx0 = (block_q5_0 *) vx;

 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
@@ -1825,7 +2188,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
     float * x_dmf = (float *) x_dm;

 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
         int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;

         if (need_check) {
@@ -1842,27 +2205,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
-    const float * x_dmf = (float *) x_dm;
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df = (const float *) y_ds;

     int u[2*VDR_Q5_0_Q8_1_MMQ];

 #pragma unroll
     for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j *
-        u[2*l+1] = y_qs[j *
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
     }

     return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx],
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
 }

 static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
@@ -1885,21 +2242,21 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
     return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
 }

-static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int tile_x_ql[
-    __shared__ half2 tile_x_dm[
+    __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];

     *x_ql = tile_x_ql;
     *x_dm = tile_x_dm;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset <
+    __builtin_assume(i_offset < nwarps);
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);

@@ -1909,7 +2266,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
     const block_q5_1 * bx0 = (block_q5_1 *) vx;

 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
@@ -1942,7 +2299,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
     const int kbxd = k % blocks_per_tile_x_row;

 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
         int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;

         if (need_check) {
@@ -1959,13 +2316,6 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;

@@ -1973,12 +2323,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(

 #pragma unroll
     for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j *
-        u[2*l+1] = y_qs[j *
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
     }

     return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
 }

 static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
@@ -1989,29 +2339,30 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
     int v[VDR_Q8_0_Q8_1_MMVQ];
     int u[VDR_Q8_0_Q8_1_MMVQ];

+#pragma unroll
     for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
         v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
         u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
     }

-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds);
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
 }

-static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int tile_x_qs[
-    __shared__ float tile_x_d[
+    __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
+    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];

     *x_ql = tile_x_qs;
     *x_dm = (half2 *) tile_x_d;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset <
+    __builtin_assume(i_offset < nwarps);
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);

@@ -2022,7 +2373,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
     const block_q8_0 * bx0 = (block_q8_0 *) vx;

 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
@@ -2032,76 +2383,35 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
         const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;

         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
-        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
     }

-
-
+    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
+    const int kbxd = k % blocks_per_tile_x_row;

-
-
-
-    // const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
+        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;

-
-
-
-    // }
-    // #endif // GGML_CUDA_MMQ_Y < 64
+        if (need_check) {
+            i = min(i, i_max);
+        }

-
+        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;

-
-
+        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
+    }
 }

 static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-
-
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-
-    const float * x_dmf = (float *) x_dm;
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df = (const float *) y_ds;

     return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
         (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
-
-}
-
-#define VDR_q2_K_q8_1 1
-
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl(
-    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
-    const half2 & dm, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    for (int i = 0; i < QR2_K; ++i) {
-        const int sc = scales[2*i];
-
-        const int vi = (v >> (2*i)) & 0x03030303;
-
-        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
-
-        int sc_high = sc >> 4;
-        sc_high |= sc_high << 8;
-        sc_high |= sc_high << 16;
-        sumf_m += d8[i] * __dp4a(sc_high, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
-    }
-
-    const float2 dmf = __half22float2(dm);
-
-    return dmf.x*sumf_d - dmf.y*sumf_m;
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+        y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
@@ -2115,34 +2425,35 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
     const uint8_t * scales = bq2_K->scales + scale_offset;

     const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
-    int
+    int u[QR2_K];
     float d8[QR2_K];

+#pragma unroll
     for (int i = 0; i < QR2_K; ++ i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
         d8[i] = bq8_1[bq8_offset + i].ds.x;
     }

-    return
+    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
 }

-static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {

-    __shared__ int tile_x_ql[
-    __shared__ half2 tile_x_dm[
-    __shared__ int tile_x_sc[
+    __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
+    __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];

     *x_ql = tile_x_ql;
     *x_dm = tile_x_dm;
     *x_sc = tile_x_sc;
 }

-template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset <
+    __builtin_assume(i_offset < nwarps);
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);

@@ -2152,7 +2463,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
     const block_q2_K * bx0 = (block_q2_K *) vx;

 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;

         if (need_check) {
@@ -2168,8 +2479,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
     const int kbxd = k % blocks_per_tile_x_row;

 #pragma unroll
-    for (int i0 = 0; i0 <
-        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) %
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
+        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;

         if (need_check) {
             i = min(i, i_max);
@@ -2181,7 +2492,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
     }

 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
         int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);

         if (need_check) {
@@ -2198,68 +2509,24 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {

-
-
-
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-
-    const int kbx = k / QI2_K;
-    const int kqsx = k % QI2_K;
-
-    const int bq8_offset = QR2_K * (kqsx / QI8_1);
-    const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
+    const int kbx = k / QI2_K;
+    const int ky = (k % QI2_K) * QR2_K;
+    const float * y_df = (const float *) y_ds;

-
+    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];

-    int
-
+    const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
+    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));

-
-
-
-        d8[l] = y_ds[y_qs_index / QI8_1].x;
+#pragma unroll
+    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
+        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
     }

-
-    }
-
-#define VDR_q3_K_q8_1 1
-
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl(
-    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
-    const int & scale_offset, const float & d, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf = 0.0f;
-
-    for (int i = 0; i < QR3_K; ++i) {
-        const int isc = scale_offset + 2*i;
-
-        const int isc_low = isc % (QK_K/32);
-        const int sc_shift_low = 4 * (isc / (QK_K/32));
-        const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
-
-        const int isc_high = isc % (QK_K/64);
-        const int sc_shift_high = 2 * (isc / (QK_K/64));
-        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
-
-        const int sc = (sc_low | sc_high) - 32;
-
-        const int vil = (vl >> (2*i)) & 0x03030303;
-
-        const int vih = ((vh >> i) << 2) & 0x04040404;
-
-        const int vi = __vsubss4(vil, vih);
+    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;

-
-
-
-        return d*sumf;
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
+    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
 }

 static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
@@ -2277,23 +2544,24 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
|
2277
2544
|
// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
|
2278
2545
|
const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
|
2279
2546
|
|
2280
|
-
int
|
2547
|
+
int u[QR3_K];
|
2281
2548
|
float d8[QR3_K];
|
2282
2549
|
|
2550
|
+
#pragma unroll
|
2283
2551
|
for (int i = 0; i < QR3_K; ++i) {
|
2284
2552
|
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
2285
2553
|
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
2286
2554
|
}
|
2287
2555
|
|
2288
|
-
return
|
2556
|
+
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
|
2289
2557
|
}
|
2290
2558
|
|
2291
|
-
static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2559
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2292
2560
|
|
2293
|
-
__shared__ int tile_x_ql[
|
2294
|
-
__shared__ half2 tile_x_dm[
|
2295
|
-
__shared__ int tile_x_qh[
|
2296
|
-
__shared__ int tile_x_sc[
|
2561
|
+
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
|
2562
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
|
2563
|
+
__shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
|
2564
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
|
2297
2565
|
|
2298
2566
|
*x_ql = tile_x_ql;
|
2299
2567
|
*x_dm = tile_x_dm;
|
@@ -2301,12 +2569,12 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 **
|
|
2301
2569
|
*x_sc = tile_x_sc;
|
2302
2570
|
}
|
2303
2571
|
|
2304
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
|
2572
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
|
2305
2573
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2306
2574
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2307
2575
|
|
2308
2576
|
__builtin_assume(i_offset >= 0);
|
2309
|
-
__builtin_assume(i_offset <
|
2577
|
+
__builtin_assume(i_offset < nwarps);
|
2310
2578
|
__builtin_assume(k >= 0);
|
2311
2579
|
__builtin_assume(k < WARP_SIZE);
|
2312
2580
|
|
@@ -2316,7 +2584,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
|
|
2316
2584
|
const block_q3_K * bx0 = (block_q3_K *) vx;
|
2317
2585
|
|
2318
2586
|
#pragma unroll
|
2319
|
-
for (int i0 = 0; i0 <
|
2587
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2320
2588
|
int i = i0 + i_offset;
|
2321
2589
|
|
2322
2590
|
if (need_check) {
|
@@ -2330,10 +2598,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
|
|
2330
2598
|
|
2331
2599
|
const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
|
2332
2600
|
const int kbxd = k % blocks_per_tile_x_row;
|
2601
|
+
float * x_dmf = (float *) x_dm;
|
2333
2602
|
|
2334
2603
|
#pragma unroll
|
2335
|
-
for (int i0 = 0; i0 <
|
2336
|
-
int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) %
|
2604
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
|
2605
|
+
int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
|
2337
2606
|
|
2338
2607
|
if (need_check) {
|
2339
2608
|
i = min(i, i_max);
|
@@ -2341,11 +2610,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
|
|
2341
2610
|
|
2342
2611
|
const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2343
2612
|
|
2344
|
-
|
2613
|
+
x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
|
2345
2614
|
}
|
2346
2615
|
|
2347
2616
|
#pragma unroll
|
2348
|
-
for (int i0 = 0; i0 <
|
2617
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
|
2349
2618
|
int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
|
2350
2619
|
|
2351
2620
|
if (need_check) {
|
@@ -2354,11 +2623,12 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
|
|
2354
2623
|
|
2355
2624
|
const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
|
2356
2625
|
|
2357
|
-
|
2626
|
+
// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
|
2627
|
+
x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
|
2358
2628
|
}
|
2359
2629
|
|
2360
2630
|
#pragma unroll
|
2361
|
-
for (int i0 = 0; i0 <
|
2631
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
|
2362
2632
|
int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
|
2363
2633
|
|
2364
2634
|
if (need_check) {
|
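The hunk above stores the q3_K high-bit mask already inverted (note the ~ applied to get_int_from_uint8), so the 0/1 mask bits later turn directly into a per-byte 4/0 subtraction. A minimal scalar sketch of that trick for a single weight; the helper name q3_value is illustrative and not part of ggml-cuda.cu:

    #include <cstdint>
    #include <cstdio>

    // Reconstruct one q3_K weight from its 2 low bits and its hmask bit.
    // hmask bit set   -> subtract 0
    // hmask bit clear -> subtract 4
    static int8_t q3_value(uint8_t low2, uint8_t hmask_bit_set) {
        const uint8_t inv = hmask_bit_set ? 0 : 1;          // the ~ applied per bit
        return (int8_t)(low2 & 0x03) - (int8_t)(inv << 2);  // the 4/0 that __vsubss4(vll, vlh) subtracts
    }

    int main() {
        printf("%d %d\n", q3_value(3, 1), q3_value(3, 0));  // prints: 3 -1
        return 0;
    }

Inverting once while the tile is loaded means every dot product that reuses the tile derives the 4/0 offset with just a shift and a mask, without touching hmask again.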
@@ -2367,7 +2637,19 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
|
|
2367
2637
|
|
2368
2638
|
const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
|
2369
2639
|
|
2370
|
-
|
2640
|
+
const int ksc = k % (QI3_K/4);
|
2641
|
+
|
2642
|
+
const int ksc_low = ksc % (QI3_K/8);
|
2643
|
+
const int shift_low = 4 * (ksc / (QI3_K/8));
|
2644
|
+
const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
|
2645
|
+
|
2646
|
+
const int ksc_high = QI3_K/8;
|
2647
|
+
const int shift_high = 2 * ksc;
|
2648
|
+
const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
|
2649
|
+
|
2650
|
+
const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
|
2651
|
+
|
2652
|
+
x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
|
2371
2653
|
}
|
2372
2654
|
}
|
2373
2655
|
|
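This hunk and several of the ones below lean on the __vsubss4 intrinsic to re-center four packed values in one instruction: per-byte signed subtraction with saturation to [-128, 127]. A plain C++ emulation of what it computes, for readers without the CUDA SIMD intrinsics reference at hand; the emulation is only illustrative, on the device this is a single instruction:

    #include <cstdint>
    #include <cstdio>

    // Emulate __vsubss4(a, b): subtract the four signed bytes of b from those
    // of a, saturating each byte lane.
    static uint32_t vsubss4_emulated(uint32_t a, uint32_t b) {
        uint32_t r = 0;
        for (int i = 0; i < 4; ++i) {
            const int8_t ai = (int8_t)(a >> (8*i));
            const int8_t bi = (int8_t)(b >> (8*i));
            int          di = (int)ai - (int)bi;
            di = di < -128 ? -128 : (di > 127 ? 127 : di);
            r |= (uint32_t)(uint8_t)(int8_t)di << (8*i);
        }
        return r;
    }

    int main() {
        // Re-center four packed 6-bit scales by subtracting 32 from each byte,
        // the same pattern as __vsubss4(sc_low | sc_high, 0x20202020) above.
        printf("%08x\n", vsubss4_emulated(0x3F002010u, 0x20202020u)); // prints: 1fe000f0
        return 0;
    }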
@@ -2375,63 +2657,29 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
|
|
2375
2657
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2376
2658
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2377
2659
|
|
2378
|
-
__builtin_assume(i >= 0);
|
2379
|
-
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2380
|
-
__builtin_assume(j >= 0);
|
2381
|
-
__builtin_assume(j < WARP_SIZE);
|
2382
|
-
__builtin_assume(k >= 0);
|
2383
|
-
__builtin_assume(k < WARP_SIZE);
|
2384
|
-
|
2385
2660
|
const int kbx = k / QI3_K;
|
2386
|
-
const int
|
2661
|
+
const int ky = (k % QI3_K) * QR3_K;
|
2662
|
+
const float * x_dmf = (const float *) x_dm;
|
2663
|
+
const float * y_df = (const float *) y_ds;
|
2387
2664
|
|
2388
|
-
const
|
2389
|
-
const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
|
2665
|
+
const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
|
2390
2666
|
|
2391
|
-
|
2392
|
-
|
2393
|
-
// invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
|
2394
|
-
const int vh = ~x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset;
|
2395
|
-
|
2396
|
-
int u[QR3_K];
|
2397
|
-
float d8[QR3_K];
|
2398
|
-
|
2399
|
-
for (int l = 0; l < QR3_K; ++ l) {
|
2400
|
-
const int y_qs_index = j * (QR3_K*WARP_SIZE) + kbx * (QR3_K*QI3_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
|
2401
|
-
u[l] = y_qs[y_qs_index];
|
2402
|
-
d8[l] = y_ds[y_qs_index / QI8_1].x;
|
2403
|
-
}
|
2404
|
-
|
2405
|
-
return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset,
|
2406
|
-
x_dm[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx].x, d8);
|
2407
|
-
}
|
2408
|
-
|
2409
|
-
#define VDR_q4_K_q8_1 2
|
2410
|
-
|
2411
|
-
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl(
|
2412
|
-
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
2413
|
-
const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
|
2414
|
-
|
2415
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2416
|
-
float sumf_d = 0.0f;
|
2417
|
-
float sumf_m = 0.0f;
|
2667
|
+
int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
|
2418
2668
|
|
2419
|
-
|
2420
|
-
|
2421
|
-
const int
|
2669
|
+
#pragma unroll
|
2670
|
+
for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
|
2671
|
+
const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
|
2672
|
+
const int shift = 2 * ((ky % 32) / 8);
|
2673
|
+
const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
|
2422
2674
|
|
2423
|
-
const int
|
2424
|
-
const int
|
2675
|
+
const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
|
2676
|
+
const int vlh = (vh << 2) & 0x04040404;
|
2425
2677
|
|
2426
|
-
|
2427
|
-
sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
|
2678
|
+
v[l] = __vsubss4(vll, vlh);
|
2428
2679
|
}
|
2429
2680
|
|
2430
|
-
|
2431
|
-
|
2432
|
-
#else
|
2433
|
-
return 0.0f; // only to satisfy the compiler
|
2434
|
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2681
|
+
const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
|
2682
|
+
return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
|
2435
2683
|
}
|
2436
2684
|
|
2437
2685
|
static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
@@ -2478,7 +2726,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|
2478
2726
|
u[2*i+1] = q8[4];
|
2479
2727
|
}
|
2480
2728
|
|
2481
|
-
return
|
2729
|
+
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
2482
2730
|
|
2483
2731
|
#else
|
2484
2732
|
|
@@ -2521,29 +2769,30 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|
2521
2769
|
return dall * sumf_d - dmin * sumf_m;
|
2522
2770
|
|
2523
2771
|
#else
|
2772
|
+
assert(false);
|
2524
2773
|
return 0.0f; // only to satisfy the compiler
|
2525
2774
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2526
2775
|
|
2527
2776
|
#endif
|
2528
2777
|
}
|
2529
2778
|
|
2530
|
-
static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2779
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2531
2780
|
|
2532
|
-
__shared__ int tile_x_ql[
|
2533
|
-
__shared__ half2 tile_x_dm[
|
2534
|
-
__shared__ int tile_x_sc[
|
2781
|
+
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
|
2782
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
|
2783
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
|
2535
2784
|
|
2536
2785
|
*x_ql = tile_x_ql;
|
2537
2786
|
*x_dm = tile_x_dm;
|
2538
2787
|
*x_sc = tile_x_sc;
|
2539
2788
|
}
|
2540
2789
|
|
2541
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
|
2790
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
|
2542
2791
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2543
2792
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2544
2793
|
|
2545
2794
|
__builtin_assume(i_offset >= 0);
|
2546
|
-
__builtin_assume(i_offset <
|
2795
|
+
__builtin_assume(i_offset < nwarps);
|
2547
2796
|
__builtin_assume(k >= 0);
|
2548
2797
|
__builtin_assume(k < WARP_SIZE);
|
2549
2798
|
|
@@ -2553,7 +2802,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
2553
2802
|
const block_q4_K * bx0 = (block_q4_K *) vx;
|
2554
2803
|
|
2555
2804
|
#pragma unroll
|
2556
|
-
for (int i0 = 0; i0 <
|
2805
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2557
2806
|
int i = i0 + i_offset;
|
2558
2807
|
|
2559
2808
|
if (need_check) {
|
@@ -2566,11 +2815,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
2566
2815
|
}
|
2567
2816
|
|
2568
2817
|
const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
|
2569
|
-
const int kbxd = k % blocks_per_tile_x_row;
|
2818
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
2570
2819
|
|
2571
2820
|
#pragma unroll
|
2572
|
-
for (int i0 = 0; i0 <
|
2573
|
-
int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) %
|
2821
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
|
2822
|
+
int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
|
2574
2823
|
|
2575
2824
|
if (need_check) {
|
2576
2825
|
i = min(i, i_max);
|
@@ -2582,8 +2831,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
2582
2831
|
}
|
2583
2832
|
|
2584
2833
|
#pragma unroll
|
2585
|
-
for (int i0 = 0; i0 <
|
2586
|
-
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) %
|
2834
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
|
2835
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
|
2587
2836
|
|
2588
2837
|
if (need_check) {
|
2589
2838
|
i = min(i, i_max);
|
@@ -2591,90 +2840,27 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
2591
2840
|
|
2592
2841
|
const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
|
2593
2842
|
|
2594
|
-
|
2595
|
-
}
|
2596
|
-
}
|
2597
|
-
|
2598
|
-
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
|
2599
|
-
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2600
|
-
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2601
|
-
|
2602
|
-
__builtin_assume(i >= 0);
|
2603
|
-
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2604
|
-
__builtin_assume(j >= 0);
|
2605
|
-
__builtin_assume(j < WARP_SIZE);
|
2606
|
-
__builtin_assume(k >= 0);
|
2607
|
-
__builtin_assume(k < WARP_SIZE);
|
2608
|
-
|
2609
|
-
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
2610
|
-
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
2611
|
-
|
2612
|
-
int v[2];
|
2613
|
-
int u[2*QR4_K];
|
2614
|
-
float d8[QR4_K];
|
2843
|
+
const int * scales = (int *) bxi->scales;
|
2615
2844
|
|
2616
|
-
|
2617
|
-
const int bq8_offset = QR4_K * ((kqsx/2) / (QI8_1/2));
|
2845
|
+
const int ksc = k % (WARP_SIZE/8);
|
2618
2846
|
|
2619
|
-
|
2620
|
-
|
2847
|
+
// scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
|
2848
|
+
int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
|
2849
|
+
scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
|
2621
2850
|
|
2622
|
-
|
2623
|
-
uint16_t aux[2];
|
2624
|
-
const int l = bq8_offset/2;
|
2625
|
-
if (l < 2) {
|
2626
|
-
aux[0] = scales[l+0] & 0x3f3f;
|
2627
|
-
aux[1] = scales[l+2] & 0x3f3f;
|
2628
|
-
} else {
|
2629
|
-
aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
|
2630
|
-
aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
|
2851
|
+
x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
|
2631
2852
|
}
|
2632
|
-
const uint8_t * sc = (const uint8_t *)aux;
|
2633
|
-
const uint8_t * m = sc + 2;
|
2634
|
-
|
2635
|
-
for (int l = 0; l < QR4_K; ++l) {
|
2636
|
-
const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
|
2637
|
-
u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
|
2638
|
-
u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
|
2639
|
-
d8[l] = y_ds[kqsy / QI8_1].x;
|
2640
|
-
}
|
2641
|
-
|
2642
|
-
return vec_dot_q4_K_q8_1_impl(v, u, sc, m, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K + kbx], d8);
|
2643
2853
|
}
|
2644
2854
|
|
2645
|
-
|
2646
|
-
|
2647
|
-
|
2648
|
-
const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
2649
|
-
const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
|
2650
|
-
|
2651
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2652
|
-
float sumf_d = 0.0f;
|
2653
|
-
float sumf_m = 0.0f;
|
2654
|
-
|
2655
|
-
for (int i = 0; i < QR5_K; ++i) {
|
2656
|
-
const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
|
2657
|
-
const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
|
2658
|
-
|
2659
|
-
const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
|
2660
|
-
const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
|
2661
|
-
|
2662
|
-
const int v0i = vl0i | vh0i;
|
2663
|
-
const int v1i = vl1i | vh1i;
|
2664
|
-
|
2665
|
-
const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
|
2666
|
-
const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
|
2667
|
-
|
2668
|
-
sumf_d += d8[i] * (dot1 * sc[i]);
|
2669
|
-
sumf_m += d8[i] * (dot2 * m[i]);
|
2670
|
-
|
2671
|
-
}
|
2855
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
|
2856
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2857
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2672
2858
|
|
2673
|
-
|
2859
|
+
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
|
2674
2860
|
|
2675
|
-
|
2676
|
-
return
|
2677
|
-
|
2861
|
+
const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
|
2862
|
+
return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
|
2863
|
+
x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
|
2678
2864
|
}
|
2679
2865
|
|
2680
2866
|
static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
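The repacking added above rewrites the 12-byte q4_K scales field into four 32-bit words whose bytes are ordered sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7, which is why vec_dot_q4_K_q8_1_mul_mat can simply pass sc and sc+8 to the dot-product helper. For reference, a scalar unpacking of that 12-byte 6-bit layout, written in the style of the CPU-side k-quant code; the helper name and the sample bytes are illustrative:

    #include <cstdint>
    #include <cstdio>

    // j in 0..7 selects one of the eight (scale, min) pairs packed into q[0..11].
    static void unpack_scale_min(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
        if (j < 4) {
            *d = q[j]     & 63;                              // 6-bit scale
            *m = q[j + 4] & 63;                              // 6-bit min
        } else {
            *d = (q[j + 4] & 0x0F) | ((q[j - 4] >> 6) << 4); // low 4 bits | high 2 bits
            *m = (q[j + 4] >>   4) | ((q[j    ] >> 6) << 4);
        }
    }

    int main() {
        const uint8_t q[12] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
        for (int j = 0; j < 8; ++j) {
            uint8_t d, m;
            unpack_scale_min(j, q, &d, &m);
            printf("j=%d d=%d m=%d\n", j, d, m);
        }
        return 0;
    }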
@@ -2711,6 +2897,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|
2711
2897
|
const uint8_t * sc = (const uint8_t *)aux;
|
2712
2898
|
const uint8_t * m = sc + 2;
|
2713
2899
|
|
2900
|
+
#pragma unroll
|
2714
2901
|
for (int i = 0; i < QR5_K; ++i) {
|
2715
2902
|
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
2716
2903
|
d8[i] = bq8i->ds.x;
|
@@ -2720,7 +2907,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|
2720
2907
|
u[2*i+1] = q8[4];
|
2721
2908
|
}
|
2722
2909
|
|
2723
|
-
return
|
2910
|
+
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
|
2724
2911
|
|
2725
2912
|
#else
|
2726
2913
|
|
@@ -2759,31 +2946,30 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|
2759
2946
|
return d * sumf_d;
|
2760
2947
|
|
2761
2948
|
#else
|
2949
|
+
assert(false);
|
2762
2950
|
return 0.0f; // only to satisfy the compiler
|
2763
2951
|
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2764
2952
|
|
2765
2953
|
#endif
|
2766
2954
|
}
|
2767
2955
|
|
2768
|
-
static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2956
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2769
2957
|
|
2770
|
-
__shared__ int tile_x_ql[
|
2771
|
-
__shared__ half2 tile_x_dm[
|
2772
|
-
__shared__ int
|
2773
|
-
__shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
|
2958
|
+
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
2959
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
|
2960
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
|
2774
2961
|
|
2775
2962
|
*x_ql = tile_x_ql;
|
2776
2963
|
*x_dm = tile_x_dm;
|
2777
|
-
*x_qh = tile_x_qh;
|
2778
2964
|
*x_sc = tile_x_sc;
|
2779
2965
|
}
|
2780
2966
|
|
2781
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
|
2967
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
|
2782
2968
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2783
2969
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2784
2970
|
|
2785
2971
|
__builtin_assume(i_offset >= 0);
|
2786
|
-
__builtin_assume(i_offset <
|
2972
|
+
__builtin_assume(i_offset < nwarps);
|
2787
2973
|
__builtin_assume(k >= 0);
|
2788
2974
|
__builtin_assume(k < WARP_SIZE);
|
2789
2975
|
|
@@ -2793,7 +2979,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
|
|
2793
2979
|
const block_q5_K * bx0 = (block_q5_K *) vx;
|
2794
2980
|
|
2795
2981
|
#pragma unroll
|
2796
|
-
for (int i0 = 0; i0 <
|
2982
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2797
2983
|
int i = i0 + i_offset;
|
2798
2984
|
|
2799
2985
|
if (need_check) {
|
@@ -2801,16 +2987,29 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
|
|
2801
2987
|
}
|
2802
2988
|
|
2803
2989
|
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
|
2990
|
+
const int ky = QR5_K*kqsx;
|
2804
2991
|
|
2805
|
-
|
2992
|
+
const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
2993
|
+
const int ql0 = (ql >> 0) & 0x0F0F0F0F;
|
2994
|
+
const int ql1 = (ql >> 4) & 0x0F0F0F0F;
|
2995
|
+
|
2996
|
+
const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
|
2997
|
+
const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
|
2998
|
+
const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
|
2999
|
+
|
3000
|
+
const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
|
3001
|
+
const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
|
3002
|
+
|
3003
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
|
3004
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
|
2806
3005
|
}
|
2807
3006
|
|
2808
3007
|
const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
|
2809
|
-
const int kbxd = k % blocks_per_tile_x_row;
|
3008
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
2810
3009
|
|
2811
3010
|
#pragma unroll
|
2812
|
-
for (int i0 = 0; i0 <
|
2813
|
-
int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) %
|
3011
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
|
3012
|
+
int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
|
2814
3013
|
|
2815
3014
|
if (need_check) {
|
2816
3015
|
i = min(i, i_max);
|
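After this change the q5_K tile already holds complete 5-bit weights: the low nibble comes from qs and the fifth bit is taken from qh, shifted into bit position 4 and OR-ed in at load time, so the tile dot products no longer have to touch qh at all (and the separate x_qh tile for q5_K is gone). A scalar sketch of that reconstruction; the helper name is illustrative:

    #include <cstdint>
    #include <cstdio>

    // Merge the 4 low bits from qs with the single high bit from qh into a 0..31
    // value, mirroring ql0 | qh0 with the 0x10101010 mask in the hunk above.
    static uint8_t q5_value(uint8_t qs_nibble, uint8_t qh_bit) {
        return (uint8_t)((qs_nibble & 0x0F) | ((qh_bit & 1) << 4));
    }

    int main() {
        printf("%d %d\n", q5_value(0x0F, 0), q5_value(0x0F, 1)); // prints: 15 31
        return 0;
    }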
@@ -2822,107 +3021,37 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
|
|
2822
3021
|
}
|
2823
3022
|
|
2824
3023
|
#pragma unroll
|
2825
|
-
for (int i0 = 0; i0 <
|
2826
|
-
int i = i0 + i_offset *
|
2827
|
-
|
2828
|
-
if (need_check) {
|
2829
|
-
i = min(i, i_max);
|
2830
|
-
}
|
2831
|
-
|
2832
|
-
const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI5_K/4);
|
2833
|
-
|
2834
|
-
x_qh[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->qh, k % (QI5_K/4));
|
2835
|
-
}
|
2836
|
-
|
2837
|
-
#pragma unroll
|
2838
|
-
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
|
2839
|
-
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
|
3024
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
|
3025
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
|
2840
3026
|
|
2841
3027
|
if (need_check) {
|
2842
3028
|
i = min(i, i_max);
|
2843
|
-
}
|
2844
|
-
|
2845
|
-
const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
|
2846
|
-
|
2847
|
-
x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI5_K/8));
|
2848
|
-
}
|
2849
|
-
}
|
2850
|
-
|
2851
|
-
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
|
2852
|
-
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2853
|
-
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2854
|
-
|
2855
|
-
__builtin_assume(i >= 0);
|
2856
|
-
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2857
|
-
__builtin_assume(j >= 0);
|
2858
|
-
__builtin_assume(j < WARP_SIZE);
|
2859
|
-
__builtin_assume(k >= 0);
|
2860
|
-
__builtin_assume(k < WARP_SIZE);
|
2861
|
-
|
2862
|
-
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
2863
|
-
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
2864
|
-
|
2865
|
-
int vl[2];
|
2866
|
-
int vh[2];
|
2867
|
-
int u[2*QR4_K];
|
2868
|
-
float d8[QR4_K];
|
2869
|
-
|
2870
|
-
const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2));
|
2871
|
-
|
2872
|
-
vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
|
2873
|
-
vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
|
2874
|
-
|
2875
|
-
vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset;
|
2876
|
-
vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset;
|
2877
|
-
|
2878
|
-
const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
|
2879
|
-
uint16_t aux[2];
|
2880
|
-
const int l = bq8_offset/2;
|
2881
|
-
if (l < 2) {
|
2882
|
-
aux[0] = scales[l+0] & 0x3f3f;
|
2883
|
-
aux[1] = scales[l+2] & 0x3f3f;
|
2884
|
-
} else {
|
2885
|
-
aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
|
2886
|
-
aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
|
2887
|
-
}
|
2888
|
-
const uint8_t * sc = (const uint8_t *)aux;
|
2889
|
-
const uint8_t * m = sc + 2;
|
2890
|
-
|
2891
|
-
for (int l = 0; l < QR5_K; ++l) {
|
2892
|
-
const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
|
2893
|
-
u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
|
2894
|
-
u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
|
2895
|
-
d8[l] = y_ds[kqsy / QI8_1].x;
|
2896
|
-
}
|
2897
|
-
|
2898
|
-
return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K + kbx], d8);
|
2899
|
-
}
|
2900
|
-
|
2901
|
-
#define VDR_q6_K_q8_1 1
|
2902
|
-
|
2903
|
-
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl(
|
2904
|
-
const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
|
2905
|
-
const float & d, const float * __restrict__ d8) {
|
2906
|
-
|
2907
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2908
|
-
float sumf = 0.0f;
|
3029
|
+
}
|
2909
3030
|
|
2910
|
-
|
2911
|
-
const int sc = scales[4*i];
|
3031
|
+
const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
|
2912
3032
|
|
2913
|
-
const int
|
3033
|
+
const int * scales = (int *) bxi->scales;
|
2914
3034
|
|
2915
|
-
const int
|
3035
|
+
const int ksc = k % (WARP_SIZE/8);
|
2916
3036
|
|
2917
|
-
|
3037
|
+
// scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m7
|
3038
|
+
int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
|
3039
|
+
scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
|
2918
3040
|
|
2919
|
-
|
3041
|
+
x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
|
2920
3042
|
}
|
3043
|
+
}
|
2921
3044
|
|
2922
|
-
|
2923
|
-
|
2924
|
-
|
2925
|
-
|
3045
|
+
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
|
3046
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
3047
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
3048
|
+
|
3049
|
+
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
|
3050
|
+
|
3051
|
+
const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
|
3052
|
+
const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
|
3053
|
+
return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
|
3054
|
+
x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
|
2926
3055
|
}
|
2927
3056
|
|
2928
3057
|
static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
@@ -2942,33 +3071,32 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
|
2942
3071
|
int u[QR6_K];
|
2943
3072
|
float d8[QR6_K];
|
2944
3073
|
|
3074
|
+
#pragma unroll
|
2945
3075
|
for (int i = 0; i < QR6_K; ++i) {
|
2946
3076
|
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
|
2947
3077
|
d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
|
2948
3078
|
}
|
2949
3079
|
|
2950
|
-
return
|
3080
|
+
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
|
2951
3081
|
}
|
2952
3082
|
|
2953
|
-
static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
3083
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2954
3084
|
|
2955
|
-
__shared__ int tile_x_ql[
|
2956
|
-
__shared__ half2 tile_x_dm[
|
2957
|
-
__shared__ int
|
2958
|
-
__shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
|
3085
|
+
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
3086
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
|
3087
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
|
2959
3088
|
|
2960
3089
|
*x_ql = tile_x_ql;
|
2961
3090
|
*x_dm = tile_x_dm;
|
2962
|
-
*x_qh = tile_x_qh;
|
2963
3091
|
*x_sc = tile_x_sc;
|
2964
3092
|
}
|
2965
3093
|
|
2966
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
|
3094
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
|
2967
3095
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2968
3096
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2969
3097
|
|
2970
3098
|
__builtin_assume(i_offset >= 0);
|
2971
|
-
__builtin_assume(i_offset <
|
3099
|
+
__builtin_assume(i_offset < nwarps);
|
2972
3100
|
__builtin_assume(k >= 0);
|
2973
3101
|
__builtin_assume(k < WARP_SIZE);
|
2974
3102
|
|
@@ -2978,7 +3106,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
|
|
2978
3106
|
const block_q6_K * bx0 = (block_q6_K *) vx;
|
2979
3107
|
|
2980
3108
|
#pragma unroll
|
2981
|
-
for (int i0 = 0; i0 <
|
3109
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2982
3110
|
int i = i0 + i_offset;
|
2983
3111
|
|
2984
3112
|
if (need_check) {
|
@@ -2986,42 +3114,43 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
|
|
2986
3114
|
}
|
2987
3115
|
|
2988
3116
|
const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
|
3117
|
+
const int ky = QR6_K*kqsx;
|
2989
3118
|
|
2990
|
-
|
2991
|
-
|
2992
|
-
|
2993
|
-
const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
|
2994
|
-
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
2995
|
-
|
2996
|
-
#pragma unroll
|
2997
|
-
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
|
2998
|
-
int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
|
3119
|
+
const int ql = get_int_from_uint8(bxi->ql, kqsx);
|
3120
|
+
const int ql0 = (ql >> 0) & 0x0F0F0F0F;
|
3121
|
+
const int ql1 = (ql >> 4) & 0x0F0F0F0F;
|
2999
3122
|
|
3000
|
-
|
3001
|
-
|
3002
|
-
|
3123
|
+
const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
|
3124
|
+
const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
|
3125
|
+
const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
|
3003
3126
|
|
3004
|
-
const
|
3127
|
+
const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
|
3128
|
+
const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
|
3005
3129
|
|
3006
|
-
|
3130
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
|
3131
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
|
3007
3132
|
}
|
3008
3133
|
|
3134
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
|
3135
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
3136
|
+
float * x_dmf = (float *) x_dm;
|
3137
|
+
|
3009
3138
|
#pragma unroll
|
3010
|
-
for (int i0 = 0; i0 <
|
3011
|
-
int i = i0 + i_offset *
|
3139
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
|
3140
|
+
int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
|
3012
3141
|
|
3013
3142
|
if (need_check) {
|
3014
3143
|
i = min(i, i_max);
|
3015
3144
|
}
|
3016
3145
|
|
3017
|
-
const block_q6_K * bxi = bx0 + i*blocks_per_row +
|
3146
|
+
const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
3018
3147
|
|
3019
|
-
|
3148
|
+
x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
|
3020
3149
|
}
|
3021
3150
|
|
3022
3151
|
#pragma unroll
|
3023
|
-
for (int i0 = 0; i0 <
|
3024
|
-
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) %
|
3152
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
|
3153
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
|
3025
3154
|
|
3026
3155
|
if (need_check) {
|
3027
3156
|
i = min(i, i_max);
|
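q6_K gets the same treatment as q5_K, with one extra step: the 4 low bits from ql and the 2 high bits from qh are combined into a 6-bit value and then re-centered by subtracting 32, which __vsubss4(ql0 | qh0, 0x20202020) does for four weights at once. The scalar equivalent for a single weight; the helper name is illustrative:

    #include <cstdint>
    #include <cstdio>

    static int8_t q6_value(uint8_t ql_nibble, uint8_t qh_2bits) {
        const uint8_t packed = (uint8_t)((ql_nibble & 0x0F) | ((qh_2bits & 0x03) << 4)); // 0..63
        return (int8_t)(packed - 32);                                                    // -32..31
    }

    int main() {
        printf("%d %d\n", q6_value(0x00, 0x00), q6_value(0x0F, 0x03)); // prints: -32 31
        return 0;
    }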
@@ -3037,41 +3166,19 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
|
|
3037
3166
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
3038
3167
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
3039
3168
|
|
3040
|
-
|
3041
|
-
|
3042
|
-
__builtin_assume(j >= 0);
|
3043
|
-
__builtin_assume(j < WARP_SIZE);
|
3044
|
-
__builtin_assume(k >= 0);
|
3045
|
-
__builtin_assume(k < WARP_SIZE);
|
3046
|
-
|
3047
|
-
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
3048
|
-
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
3049
|
-
|
3050
|
-
const int bq8_offset = 2 * QR6_K * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/4);
|
3051
|
-
const int scale_offset = (QI6_K/4) * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/8);
|
3052
|
-
const int vh_shift = 2 * ((kqsx % (QI6_K/2)) / (QI6_K/4));
|
3169
|
+
const float * x_dmf = (const float *) x_dm;
|
3170
|
+
const float * y_df = (const float *) y_ds;
|
3053
3171
|
|
3054
|
-
const
|
3172
|
+
const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
|
3055
3173
|
|
3056
|
-
const int
|
3057
|
-
const
|
3058
|
-
|
3059
|
-
int u[QR6_K];
|
3060
|
-
float d8[QR6_K];
|
3061
|
-
|
3062
|
-
for (int l = 0; l < QR6_K; ++l) {
|
3063
|
-
const int kqsy = j * (QR6_K*WARP_SIZE) + kbx * (QR6_K*QI6_K) + (bq8_offset + 2*l)*QI8_1 + kqsx % QI8_1;
|
3064
|
-
u[l] = y_qs[kqsy];
|
3065
|
-
d8[l] = y_ds[kqsy / QI8_1].x;
|
3066
|
-
}
|
3067
|
-
|
3068
|
-
return vec_dot_q6_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales,
|
3069
|
-
x_dm[i * (WARP_SIZE/QI6_K) + i/QI6_K + kbx].x, d8);
|
3174
|
+
const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
|
3175
|
+
const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
|
3176
|
+
return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
|
3070
3177
|
}
|
3071
3178
|
|
3072
|
-
template <int qk, int qr, int qi, typename block_q_t,
|
3179
|
+
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
|
3073
3180
|
allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
|
3074
|
-
static
|
3181
|
+
static __device__ __forceinline__ void mul_mat_q(
|
3075
3182
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3076
3183
|
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3077
3184
|
|
@@ -3084,14 +3191,10 @@ static __global__ void mul_mat_q(
|
|
3084
3191
|
|
3085
3192
|
const int & ncols_dst = ncols_y;
|
3086
3193
|
|
3087
|
-
const int
|
3088
|
-
const int tid_y = threadIdx.y;
|
3089
|
-
|
3090
|
-
const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
|
3194
|
+
const int row_dst_0 = blockIdx.x*mmq_y;
|
3091
3195
|
const int & row_x_0 = row_dst_0;
|
3092
|
-
const int row_dst = row_dst_0 + tid_x;
|
3093
3196
|
|
3094
|
-
const int col_dst_0 = blockIdx.y*
|
3197
|
+
const int col_dst_0 = blockIdx.y*mmq_x;
|
3095
3198
|
const int & col_y_0 = col_dst_0;
|
3096
3199
|
|
3097
3200
|
int * tile_x_ql = nullptr;
|
@@ -3101,75 +3204,444 @@ static __global__ void mul_mat_q(
|
|
3101
3204
|
|
3102
3205
|
allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
|
3103
3206
|
|
3104
|
-
|
3105
|
-
|
3106
|
-
__shared__ int tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
|
3107
|
-
__shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];
|
3207
|
+
__shared__ int tile_y_qs[mmq_x * WARP_SIZE];
|
3208
|
+
__shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
|
3108
3209
|
|
3109
|
-
float sum[
|
3210
|
+
float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
|
3110
3211
|
|
3111
3212
|
for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
|
3112
3213
|
|
3113
3214
|
load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
|
3114
|
-
|
3215
|
+
threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
|
3115
3216
|
|
3217
|
+
#pragma unroll
|
3116
3218
|
for (int ir = 0; ir < qr; ++ir) {
|
3117
|
-
const int kqs = ir*WARP_SIZE +
|
3219
|
+
const int kqs = ir*WARP_SIZE + threadIdx.x;
|
3118
3220
|
const int kbxd = kqs / QI8_1;
|
3119
3221
|
|
3120
|
-
|
3121
|
-
|
3222
|
+
#pragma unroll
|
3223
|
+
for (int i = 0; i < mmq_x; i += nwarps) {
|
3224
|
+
const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
|
3122
3225
|
|
3123
3226
|
const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
|
3124
3227
|
|
3125
|
-
|
3228
|
+
const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
|
3229
|
+
tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
|
3126
3230
|
}
|
3127
|
-
}
|
3128
3231
|
|
3129
|
-
|
3130
|
-
|
3131
|
-
|
3132
|
-
|
3133
|
-
|
3134
|
-
|
3232
|
+
#pragma unroll
|
3233
|
+
for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
|
3234
|
+
const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
|
3235
|
+
const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
|
3236
|
+
const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
|
3237
|
+
|
3238
|
+
// if the sum is not needed it's faster to transform the scale to f32 ahead of time
|
3239
|
+
const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
|
3240
|
+
half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
|
3241
|
+
if (need_sum) {
|
3242
|
+
*dsi_dst = *dsi_src;
|
3243
|
+
} else {
|
3244
|
+
float * dfi_dst = (float *) dsi_dst;
|
3245
|
+
*dfi_dst = (*dsi_src).x;
|
3246
|
+
}
|
3247
|
+
}
|
3135
3248
|
|
3136
|
-
|
3249
|
+
__syncthreads();
|
3137
3250
|
|
3138
|
-
#
|
3139
|
-
|
3140
|
-
#endif // __CUDA_ARCH__ >= 700
|
3141
|
-
for (int k = 0; k < WARP_SIZE; k += vdr) {
|
3251
|
+
// #pragma unroll // unrolling this loop causes too much register pressure
|
3252
|
+
for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
|
3142
3253
|
#pragma unroll
|
3143
|
-
|
3254
|
+
for (int j = 0; j < mmq_x; j += nwarps) {
|
3144
3255
|
#pragma unroll
|
3145
|
-
|
3146
|
-
|
3147
|
-
|
3256
|
+
for (int i = 0; i < mmq_y; i += WARP_SIZE) {
|
3257
|
+
sum[i/WARP_SIZE][j/nwarps] += vec_dot(
|
3258
|
+
tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
|
3259
|
+
threadIdx.x + i, threadIdx.y + j, k);
|
3260
|
+
}
|
3148
3261
|
}
|
3149
3262
|
}
|
3150
|
-
}
|
3151
|
-
|
3152
|
-
__syncthreads();
|
3153
|
-
}
|
3154
3263
|
|
3155
|
-
|
3156
|
-
|
3157
|
-
return;
|
3264
|
+
__syncthreads();
|
3265
|
+
}
|
3158
3266
|
}
|
3159
3267
|
|
3160
|
-
|
3161
|
-
|
3268
|
+
#pragma unroll
|
3269
|
+
for (int j = 0; j < mmq_x; j += nwarps) {
|
3270
|
+
const int col_dst = col_dst_0 + j + threadIdx.y;
|
3162
3271
|
|
3163
3272
|
if (col_dst >= ncols_dst) {
|
3164
3273
|
return;
|
3165
3274
|
}
|
3166
3275
|
|
3167
|
-
|
3168
|
-
|
3276
|
+
#pragma unroll
|
3277
|
+
for (int i = 0; i < mmq_y; i += WARP_SIZE) {
|
3278
|
+
const int row_dst = row_dst_0 + threadIdx.x + i;
|
3279
|
+
|
3280
|
+
if (row_dst >= nrows_dst) {
|
3281
|
+
continue;
|
3282
|
+
}
|
3283
|
+
|
3284
|
+
dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
|
3169
3285
|
}
|
3170
3286
|
}
|
3171
3287
|
}
|
3172
3288
|
|
3289
|
+
#define MMQ_X_Q4_0_AMPERE 64
|
3290
|
+
#define MMQ_Y_Q4_0_AMPERE 128
|
3291
|
+
#define NWARPS_Q4_0_AMPERE 4
|
3292
|
+
#define MMQ_X_Q4_0_PASCAL 64
|
3293
|
+
#define MMQ_Y_Q4_0_PASCAL 64
|
3294
|
+
#define NWARPS_Q4_0_PASCAL 8
|
3295
|
+
|
3296
|
+
template <bool need_check> static __global__ void mul_mat_q4_0(
|
3297
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3298
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3299
|
+
|
3300
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3301
|
+
const int mmq_x = MMQ_X_Q4_0_AMPERE;
|
3302
|
+
const int mmq_y = MMQ_Y_Q4_0_AMPERE;
|
3303
|
+
const int nwarps = NWARPS_Q4_0_AMPERE;
|
3304
|
+
|
3305
|
+
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3306
|
+
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3307
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3308
|
+
|
3309
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3310
|
+
const int mmq_x = MMQ_X_Q4_0_PASCAL;
|
3311
|
+
const int mmq_y = MMQ_Y_Q4_0_PASCAL;
|
3312
|
+
const int nwarps = NWARPS_Q4_0_PASCAL;
|
3313
|
+
|
3314
|
+
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3315
|
+
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3316
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3317
|
+
#else
|
3318
|
+
(void) vec_dot_q4_0_q8_1_mul_mat;
|
3319
|
+
assert(false);
|
3320
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3321
|
+
}
|
3322
|
+
|
3323
|
+
#define MMQ_X_Q4_1_AMPERE 64
|
3324
|
+
#define MMQ_Y_Q4_1_AMPERE 128
|
3325
|
+
#define NWARPS_Q4_1_AMPERE 4
|
3326
|
+
#define MMQ_X_Q4_1_PASCAL 64
|
3327
|
+
#define MMQ_Y_Q4_1_PASCAL 64
|
3328
|
+
#define NWARPS_Q4_1_PASCAL 8
|
3329
|
+
|
3330
|
+
template <bool need_check> static __global__ void
|
3331
|
+
#if __CUDA_ARCH__ < CC_TURING
|
3332
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
|
3333
|
+
#endif // __CUDA_ARCH__ < CC_TURING
|
3334
|
+
mul_mat_q4_1(
|
3335
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3336
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3337
|
+
|
3338
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3339
|
+
const int mmq_x = MMQ_X_Q4_1_AMPERE;
|
3340
|
+
const int mmq_y = MMQ_Y_Q4_1_AMPERE;
|
3341
|
+
const int nwarps = NWARPS_Q4_1_AMPERE;
|
3342
|
+
|
3343
|
+
mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
|
3344
|
+
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
|
3345
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3346
|
+
|
3347
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3348
|
+
const int mmq_x = MMQ_X_Q4_1_PASCAL;
|
3349
|
+
const int mmq_y = MMQ_Y_Q4_1_PASCAL;
|
3350
|
+
const int nwarps = NWARPS_Q4_1_PASCAL;
|
3351
|
+
|
3352
|
+
mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
|
3353
|
+
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
|
3354
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3355
|
+
#else
|
3356
|
+
(void) vec_dot_q4_1_q8_1_mul_mat;
|
3357
|
+
assert(false);
|
3358
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3359
|
+
}
|
3360
|
+
|
3361
|
+
#define MMQ_X_Q5_0_AMPERE 128
|
3362
|
+
#define MMQ_Y_Q5_0_AMPERE 64
|
3363
|
+
#define NWARPS_Q5_0_AMPERE 4
|
3364
|
+
#define MMQ_X_Q5_0_PASCAL 64
|
3365
|
+
#define MMQ_Y_Q5_0_PASCAL 64
|
3366
|
+
#define NWARPS_Q5_0_PASCAL 8
|
3367
|
+
|
3368
|
+
template <bool need_check> static __global__ void mul_mat_q5_0(
|
3369
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3370
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3371
|
+
|
3372
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3373
|
+
const int mmq_x = MMQ_X_Q5_0_AMPERE;
|
3374
|
+
const int mmq_y = MMQ_Y_Q5_0_AMPERE;
|
3375
|
+
const int nwarps = NWARPS_Q5_0_AMPERE;
|
3376
|
+
|
3377
|
+
mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
|
3378
|
+
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
3379
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3380
|
+
|
3381
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3382
|
+
const int mmq_x = MMQ_X_Q5_0_PASCAL;
|
3383
|
+
const int mmq_y = MMQ_Y_Q5_0_PASCAL;
|
3384
|
+
const int nwarps = NWARPS_Q5_0_PASCAL;
|
3385
|
+
|
3386
|
+
mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
|
3387
|
+
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
3388
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3389
|
+
#else
|
3390
|
+
(void) vec_dot_q5_0_q8_1_mul_mat;
|
3391
|
+
assert(false);
|
3392
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3393
|
+
}
|
3394
|
+
|
3395
|
+
#define MMQ_X_Q5_1_AMPERE 128
|
3396
|
+
#define MMQ_Y_Q5_1_AMPERE 64
|
3397
|
+
#define NWARPS_Q5_1_AMPERE 4
|
3398
|
+
#define MMQ_X_Q5_1_PASCAL 64
|
3399
|
+
#define MMQ_Y_Q5_1_PASCAL 64
|
3400
|
+
#define NWARPS_Q5_1_PASCAL 8
|
3401
|
+
|
3402
|
+
template <bool need_check> static __global__ void mul_mat_q5_1(
|
3403
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3404
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3405
|
+
|
3406
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3407
|
+
const int mmq_x = MMQ_X_Q5_1_AMPERE;
|
3408
|
+
const int mmq_y = MMQ_Y_Q5_1_AMPERE;
|
3409
|
+
const int nwarps = NWARPS_Q5_1_AMPERE;
|
3410
|
+
|
3411
|
+
mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
|
3412
|
+
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
3413
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3414
|
+
|
3415
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3416
|
+
const int mmq_x = MMQ_X_Q5_1_PASCAL;
|
3417
|
+
const int mmq_y = MMQ_Y_Q5_1_PASCAL;
|
3418
|
+
const int nwarps = NWARPS_Q5_1_PASCAL;
|
3419
|
+
|
3420
|
+
mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
|
3421
|
+
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
3422
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3423
|
+
#else
|
3424
|
+
(void) vec_dot_q5_1_q8_1_mul_mat;
|
3425
|
+
assert(false);
|
3426
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3427
|
+
}
|
3428
|
+
|
3429
|
+
#define MMQ_X_Q8_0_AMPERE 128
|
3430
|
+
#define MMQ_Y_Q8_0_AMPERE 64
|
3431
|
+
#define NWARPS_Q8_0_AMPERE 4
|
3432
|
+
#define MMQ_X_Q8_0_PASCAL 64
|
3433
|
+
#define MMQ_Y_Q8_0_PASCAL 64
|
3434
|
+
#define NWARPS_Q8_0_PASCAL 8
|
3435
|
+
|
3436
|
+
template <bool need_check> static __global__ void mul_mat_q8_0(
|
3437
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3438
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3439
|
+
|
3440
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3441
|
+
const int mmq_x = MMQ_X_Q8_0_AMPERE;
|
3442
|
+
const int mmq_y = MMQ_Y_Q8_0_AMPERE;
|
3443
|
+
const int nwarps = NWARPS_Q8_0_AMPERE;
|
3444
|
+
|
3445
|
+
mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
|
3446
|
+
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
3447
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3448
|
+
|
3449
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3450
|
+
const int mmq_x = MMQ_X_Q8_0_PASCAL;
|
3451
|
+
const int mmq_y = MMQ_Y_Q8_0_PASCAL;
|
3452
|
+
const int nwarps = NWARPS_Q8_0_PASCAL;
|
3453
|
+
|
3454
|
+
mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
|
3455
|
+
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
3456
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3457
|
+
#else
|
3458
|
+
(void) vec_dot_q8_0_q8_1_mul_mat;
|
3459
|
+
assert(false);
|
3460
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3461
|
+
}
|
3462
|
+
|
3463
|
+
#define MMQ_X_Q2_K_AMPERE 64
|
3464
|
+
#define MMQ_Y_Q2_K_AMPERE 128
|
3465
|
+
#define NWARPS_Q2_K_AMPERE 4
|
3466
|
+
#define MMQ_X_Q2_K_PASCAL 64
|
3467
|
+
#define MMQ_Y_Q2_K_PASCAL 64
|
3468
|
+
#define NWARPS_Q2_K_PASCAL 8
|
3469
|
+
|
3470
|
+
template <bool need_check> static __global__ void mul_mat_q2_K(
|
3471
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3472
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3473
|
+
|
3474
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3475
|
+
const int mmq_x = MMQ_X_Q2_K_AMPERE;
|
3476
|
+
const int mmq_y = MMQ_Y_Q2_K_AMPERE;
|
3477
|
+
const int nwarps = NWARPS_Q2_K_AMPERE;
|
3478
|
+
|
3479
|
+
mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
|
3480
|
+
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
|
3481
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3482
|
+
|
3483
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3484
|
+
const int mmq_x = MMQ_X_Q2_K_PASCAL;
|
3485
|
+
const int mmq_y = MMQ_Y_Q2_K_PASCAL;
|
3486
|
+
const int nwarps = NWARPS_Q2_K_PASCAL;
|
3487
|
+
|
3488
|
+
mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
|
3489
|
+
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
|
3490
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3491
|
+
#else
|
3492
|
+
(void) vec_dot_q2_K_q8_1_mul_mat;
|
3493
|
+
assert(false);
|
3494
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3495
|
+
}
|
3496
|
+
|
3497
|
+
#define MMQ_X_Q3_K_AMPERE 128
|
3498
|
+
#define MMQ_Y_Q3_K_AMPERE 128
|
3499
|
+
#define NWARPS_Q3_K_AMPERE 4
|
3500
|
+
#define MMQ_X_Q3_K_PASCAL 64
|
3501
|
+
#define MMQ_Y_Q3_K_PASCAL 64
|
3502
|
+
#define NWARPS_Q3_K_PASCAL 8
|
3503
|
+
|
3504
|
+
template <bool need_check> static __global__ void
|
3505
|
+
#if __CUDA_ARCH__ < CC_TURING
|
3506
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
|
3507
|
+
#endif // __CUDA_ARCH__ < CC_TURING
|
3508
|
+
mul_mat_q3_K(
|
3509
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3510
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3511
|
+
|
3512
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3513
|
+
const int mmq_x = MMQ_X_Q3_K_AMPERE;
|
3514
|
+
const int mmq_y = MMQ_Y_Q3_K_AMPERE;
|
3515
|
+
const int nwarps = NWARPS_Q3_K_AMPERE;
|
3516
|
+
|
3517
|
+
mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
|
3518
|
+
load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
|
3519
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3520
|
+
|
3521
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3522
|
+
const int mmq_x = MMQ_X_Q3_K_PASCAL;
|
3523
|
+
const int mmq_y = MMQ_Y_Q3_K_PASCAL;
|
3524
|
+
const int nwarps = NWARPS_Q3_K_PASCAL;
|
3525
|
+
|
3526
|
+
mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
|
3527
|
+
load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
|
3528
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3529
|
+
#else
|
3530
|
+
(void) vec_dot_q3_K_q8_1_mul_mat;
|
3531
|
+
assert(false);
|
3532
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3533
|
+
}
|
3534
|
+
|
3535
|
+
#define MMQ_X_Q4_K_AMPERE 64
|
3536
|
+
#define MMQ_Y_Q4_K_AMPERE 128
|
3537
|
+
#define NWARPS_Q4_K_AMPERE 4
|
3538
|
+
#define MMQ_X_Q4_K_PASCAL 64
|
3539
|
+
#define MMQ_Y_Q4_K_PASCAL 64
|
3540
|
+
#define NWARPS_Q4_K_PASCAL 8
|
3541
|
+
|
3542
|
+
template <bool need_check> static __global__ void
|
3543
|
+
#if __CUDA_ARCH__ < CC_TURING
|
3544
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
|
3545
|
+
#endif // __CUDA_ARCH__ < CC_TURING
|
3546
|
+
mul_mat_q4_K(
|
3547
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3548
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3549
|
+
|
3550
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3551
|
+
const int mmq_x = MMQ_X_Q4_K_AMPERE;
|
3552
|
+
const int mmq_y = MMQ_Y_Q4_K_AMPERE;
|
3553
|
+
const int nwarps = NWARPS_Q4_K_AMPERE;
|
3554
|
+
|
3555
|
+
mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
|
3556
|
+
load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
|
3557
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3558
|
+
|
3559
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3560
|
+
const int mmq_x = MMQ_X_Q4_K_PASCAL;
|
3561
|
+
const int mmq_y = MMQ_Y_Q4_K_PASCAL;
|
3562
|
+
const int nwarps = NWARPS_Q4_K_PASCAL;
|
3563
|
+
|
3564
|
+
mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
|
3565
|
+
load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
|
3566
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3567
|
+
#else
|
3568
|
+
(void) vec_dot_q4_K_q8_1_mul_mat;
|
3569
|
+
assert(false);
|
3570
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3571
|
+
}
|
3572
|
+
|
3573
|
+
#define MMQ_X_Q5_K_AMPERE 64
|
3574
|
+
#define MMQ_Y_Q5_K_AMPERE 128
|
3575
|
+
#define NWARPS_Q5_K_AMPERE 4
|
3576
|
+
#define MMQ_X_Q5_K_PASCAL 64
|
3577
|
+
#define MMQ_Y_Q5_K_PASCAL 64
|
3578
|
+
#define NWARPS_Q5_K_PASCAL 8
|
3579
|
+
|
3580
|
+
template <bool need_check> static __global__ void mul_mat_q5_K(
|
3581
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3582
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3583
|
+
|
3584
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3585
|
+
const int mmq_x = MMQ_X_Q5_K_AMPERE;
|
3586
|
+
const int mmq_y = MMQ_Y_Q5_K_AMPERE;
|
3587
|
+
const int nwarps = NWARPS_Q5_K_AMPERE;
|
3588
|
+
|
3589
|
+
mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
|
3590
|
+
load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
|
3591
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3592
|
+
|
3593
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3594
|
+
const int mmq_x = MMQ_X_Q5_K_PASCAL;
|
3595
|
+
const int mmq_y = MMQ_Y_Q5_K_PASCAL;
|
3596
|
+
const int nwarps = NWARPS_Q5_K_PASCAL;
|
3597
|
+
|
3598
|
+
mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
|
3599
|
+
load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
|
3600
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3601
|
+
#else
|
3602
|
+
(void) vec_dot_q5_K_q8_1_mul_mat;
|
3603
|
+
assert(false);
|
3604
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3605
|
+
}
|
3606
|
+
|
3607
|
+
#define MMQ_X_Q6_K_AMPERE 64
|
3608
|
+
#define MMQ_Y_Q6_K_AMPERE 64
|
3609
|
+
#define NWARPS_Q6_K_AMPERE 4
|
3610
|
+
#define MMQ_X_Q6_K_PASCAL 64
|
3611
|
+
#define MMQ_Y_Q6_K_PASCAL 64
|
3612
|
+
#define NWARPS_Q6_K_PASCAL 8
|
3613
|
+
|
3614
|
+
template <bool need_check> static __global__ void
|
3615
|
+
#if __CUDA_ARCH__ < CC_TURING
|
3616
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
|
3617
|
+
#endif // __CUDA_ARCH__ < CC_TURING
|
3618
|
+
mul_mat_q6_K(
|
3619
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3620
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3621
|
+
|
3622
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3623
|
+
const int mmq_x = MMQ_X_Q6_K_AMPERE;
|
3624
|
+
const int mmq_y = MMQ_Y_Q6_K_AMPERE;
|
3625
|
+
const int nwarps = NWARPS_Q6_K_AMPERE;
|
3626
|
+
|
3627
|
+
mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
|
3628
|
+
load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
|
3629
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3630
|
+
|
3631
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3632
|
+
const int mmq_x = MMQ_X_Q6_K_PASCAL;
|
3633
|
+
const int mmq_y = MMQ_Y_Q6_K_PASCAL;
|
3634
|
+
const int nwarps = NWARPS_Q6_K_PASCAL;
|
3635
|
+
|
3636
|
+
mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
|
3637
|
+
load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
|
3638
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3639
|
+
#else
|
3640
|
+
(void) vec_dot_q6_K_q8_1_mul_mat;
|
3641
|
+
assert(false);
|
3642
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3643
|
+
}
|
3644
|
+
|
3173
3645
|
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
3174
3646
|
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
|
3175
3647
|
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
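The mul_mat_q body is now a plain __device__ helper, and each quantization type wraps it in a __global__ kernel whose tile shape (MMQ_X, MMQ_Y, NWARPS) is picked per architecture tier, both at compile time via __CUDA_ARCH__ and at run time in the host launchers via g_compute_capabilities. A host-side C++ sketch of that selection for q4_0, using the constants defined above; the struct and function names, and the sample device, are illustrative:

    #include <cstdio>

    struct mmq_config { int mmq_x; int mmq_y; int nwarps; };

    // Thresholds: 700 for Turing and newer, 610 for the first architecture with __dp4a.
    static mmq_config pick_q4_0_config(int compute_capability) {
        if (compute_capability >= 700) {
            return { 64, 128, 4 };  // MMQ_X/MMQ_Y/NWARPS_Q4_0_AMPERE
        }
        if (compute_capability >= 610) {
            return { 64, 64, 8 };   // MMQ_X/MMQ_Y/NWARPS_Q4_0_PASCAL
        }
        return { 0, 0, 0 };         // no integer dot product support: mul_mat_q is not used
    }

    int main() {
        const int nrows_x = 4096, ncols_y = 512;
        const mmq_config c = pick_q4_0_config(750);                 // e.g. a Turing device
        const int block_num_x = (nrows_x + c.mmq_y - 1) / c.mmq_y;  // tiles along the rows of x
        const int block_num_y = (ncols_y + c.mmq_x - 1) / c.mmq_x;  // tiles along the columns of y
        printf("grid %d x %d, block 32 x %d\n", block_num_x, block_num_y, c.nwarps); // grid 32 x 8, block 32 x 4
        return 0;
    }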
@@ -3780,7 +4252,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
3780
4252
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
3781
4253
|
const dim3 block_nums(1, block_num_y, 1);
|
3782
4254
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
3783
|
-
mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
|
4255
|
+
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
|
3784
4256
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
3785
4257
|
}
|
3786
4258
|
|
@@ -3789,7 +4261,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
3789
4261
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
3790
4262
|
const dim3 block_nums(1, block_num_y, 1);
|
3791
4263
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
3792
|
-
mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
|
4264
|
+
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
|
3793
4265
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
3794
4266
|
}
|
3795
4267
|
|
@@ -3798,7 +4270,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
3798
4270
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
3799
4271
|
const dim3 block_nums(1, block_num_y, 1);
|
3800
4272
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
3801
|
-
mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
|
4273
|
+
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
|
3802
4274
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
3803
4275
|
}
|
3804
4276
|
|
@@ -3807,7 +4279,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
3807
4279
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
3808
4280
|
const dim3 block_nums(1, block_num_y, 1);
|
3809
4281
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
3810
|
-
mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
|
4282
|
+
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
|
3811
4283
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
3812
4284
|
}
|
3813
4285
|
|
@@ -3816,7 +4288,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|
3816
4288
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
3817
4289
|
const dim3 block_nums(1, block_num_y, 1);
|
3818
4290
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
3819
|
-
mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
|
4291
|
+
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
|
3820
4292
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
3821
4293
|
}
|
3822
4294
|
|
@@ -3867,17 +4339,36 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q4_0_AMPERE;
+        mmq_y = MMQ_Y_Q4_0_AMPERE;
+        nwarps = NWARPS_Q4_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q4_0_PASCAL;
+        mmq_y = MMQ_Y_Q4_0_PASCAL;
+        nwarps = NWARPS_Q4_0_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x %
-
-
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-
-
+        const bool need_check = true;
+        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
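Note: ggml_mul_mat_q4_0_q8_1_cuda above (and each launcher that follows) first queries the current device, then picks an mmq tile shape and warp count by compute capability: one set of constants on devices at or above CC_TURING, another on DP4A-capable (>= MIN_CC_DP4A) devices, and an assertion failure otherwise. A standalone sketch of that selection pattern follows; the tile values are placeholders, not the MMQ_*/NWARPS_* constants from this file.

    #include <cassert>

    struct mmq_tile { int mmq_x; int mmq_y; int nwarps; };

    static const int kMinCcDp4a = 610; // mirrors MIN_CC_DP4A
    static const int kCcTuring  = 700; // mirrors CC_TURING

    // Placeholder tile shapes; the real MMQ_*_AMPERE / MMQ_*_PASCAL values differ per quant type.
    static mmq_tile select_tile(int compute_capability) {
        if (compute_capability >= kCcTuring) {
            return {64, 128, 4};
        } else if (compute_capability >= kMinCcDp4a) {
            return {64, 64, 8};
        }
        assert(false && "mul_mat_q kernels need __dp4a support");
        return {0, 0, 0};
    }

    int main() {
        const mmq_tile t = select_tile(750); // e.g. a Turing-class device
        return t.mmq_y == 128 ? 0 : 1;
    }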
@@ -3885,17 +4376,36 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q4_1_AMPERE;
+        mmq_y = MMQ_Y_Q4_1_AMPERE;
+        nwarps = NWARPS_Q4_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q4_1_PASCAL;
+        mmq_y = MMQ_Y_Q4_1_PASCAL;
+        nwarps = NWARPS_Q4_1_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x %
-
-
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-
-
+        const bool need_check = true;
+        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -3903,17 +4413,36 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q5_0_AMPERE;
+        mmq_y = MMQ_Y_Q5_0_AMPERE;
+        nwarps = NWARPS_Q5_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q5_0_PASCAL;
+        mmq_y = MMQ_Y_Q5_0_PASCAL;
+        nwarps = NWARPS_Q5_0_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x %
-
-
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-
-
+        const bool need_check = true;
+        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -3921,17 +4450,36 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q5_1_AMPERE;
+        mmq_y = MMQ_Y_Q5_1_AMPERE;
+        nwarps = NWARPS_Q5_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q5_1_PASCAL;
+        mmq_y = MMQ_Y_Q5_1_PASCAL;
+        nwarps = NWARPS_Q5_1_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x %
-
-
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-
-
+        const bool need_check = true;
+        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -3939,17 +4487,36 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q8_0_AMPERE;
+        mmq_y = MMQ_Y_Q8_0_AMPERE;
+        nwarps = NWARPS_Q8_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q8_0_PASCAL;
+        mmq_y = MMQ_Y_Q8_0_PASCAL;
+        nwarps = NWARPS_Q8_0_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x %
-
-
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-
-
+        const bool need_check = true;
+        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
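Note: every one of these launchers ends with the same dispatch on need_check. When nrows_x is an exact multiple of the row tile mmq_y, the kernel is instantiated without bounds checks; otherwise the need_check = true instantiation guards the final partial tile. A small C++ sketch of that compile-time dispatch follows; fake_mul_mat_q is a stand-in, not a kernel from this file.

    #include <cstdio>

    // Stand-in for the templated mul_mat_q* kernels: the bounds check is a
    // compile-time parameter, so the common aligned case pays nothing for it.
    template <bool need_check>
    static void fake_mul_mat_q(int nrows_x, int mmq_y) {
        std::printf("rows=%d, tile=%d, bounds check %s\n",
                    nrows_x, mmq_y, need_check ? "on" : "off");
    }

    static void launch(int nrows_x, int mmq_y) {
        if (nrows_x % mmq_y == 0) {
            fake_mul_mat_q<false>(nrows_x, mmq_y);
        } else {
            fake_mul_mat_q<true>(nrows_x, mmq_y);
        }
    }

    int main() {
        launch(4096, 64); // aligned: unchecked variant
        launch(4100, 64); // last tile is partial: checked variant
        return 0;
    }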
@@ -3957,17 +4524,36 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q2_K_AMPERE;
+        mmq_y = MMQ_Y_Q2_K_AMPERE;
+        nwarps = NWARPS_Q2_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q2_K_PASCAL;
+        mmq_y = MMQ_Y_Q2_K_PASCAL;
+        nwarps = NWARPS_Q2_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x %
-
-
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-
-
+        const bool need_check = true;
+        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -3975,17 +4561,36 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q3_K_AMPERE;
+        mmq_y = MMQ_Y_Q3_K_AMPERE;
+        nwarps = NWARPS_Q3_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q3_K_PASCAL;
+        mmq_y = MMQ_Y_Q3_K_PASCAL;
+        nwarps = NWARPS_Q3_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x %
-
-
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-
-
+        const bool need_check = true;
+        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -3993,17 +4598,36 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q4_K_AMPERE;
+        mmq_y = MMQ_Y_Q4_K_AMPERE;
+        nwarps = NWARPS_Q4_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q4_K_PASCAL;
+        mmq_y = MMQ_Y_Q4_K_PASCAL;
+        nwarps = NWARPS_Q4_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x %
-
-
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-
-
+        const bool need_check = true;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4011,17 +4635,36 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q5_K_AMPERE;
+        mmq_y = MMQ_Y_Q5_K_AMPERE;
+        nwarps = NWARPS_Q5_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q5_K_PASCAL;
+        mmq_y = MMQ_Y_Q5_K_PASCAL;
+        nwarps = NWARPS_Q5_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x %
-
-
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-
-
+        const bool need_check = true;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4029,17 +4672,36 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
 
-
-
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_TURING) {
+        mmq_x = MMQ_X_Q6_K_AMPERE;
+        mmq_y = MMQ_Y_Q6_K_AMPERE;
+        nwarps = NWARPS_Q6_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q6_K_PASCAL;
+        mmq_y = MMQ_Y_Q6_K_PASCAL;
+        nwarps = NWARPS_Q6_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE,
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
 
-    if (nrows_x %
-
-
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     } else {
-
-
+        const bool need_check = true;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4214,20 +4876,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
 }
 
 
-static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
-static size_t g_scratch_offset = 0;
-
-static int g_device_count = -1;
-static int g_main_device = 0;
-static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
-static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q = false;
-
-static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
-
-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
 void ggml_init_cublas() {
     static bool initialized = false;
 
@@ -4583,6 +5231,37 @@ inline void ggml_cuda_op_mul_mat_q(
     (void) i1;
 }
 
+static int64_t get_row_rounding(ggml_type type) {
+    int max_compute_capability = INT_MIN;
+    for (int id = 0; id < g_device_count; ++id) {
+        if (max_compute_capability < g_compute_capabilities[id]
+                && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            max_compute_capability = g_compute_capabilities[id];
+        }
+    }
+
+    switch(type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+            return max_compute_capability >= CC_TURING ? 128 : 64;
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return 64;
+        case GGML_TYPE_F16:
+            return 1;
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+            return max_compute_capability >= CC_TURING ? 128 : 64;
+        case GGML_TYPE_Q6_K:
+            return 64;
+        default:
+            GGML_ASSERT(false);
+    }
+}
+
 inline void ggml_cuda_op_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
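Note: get_row_rounding, added above, returns the granularity (in rows) to which each device's slice of a split tensor is aligned; the callers in the following hunks snap the raw boundary nrows*g_tensor_split[id] down to a multiple of it. A worked example with hypothetical numbers follows.

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t nrows    = 10000;
        const float   split    = 0.37f; // hypothetical g_tensor_split[id]
        const int64_t rounding = 128;   // e.g. a k-quant type on a >= CC_TURING device

        int64_t row_low = (int64_t)(nrows*split); // 3700
        row_low -= row_low % rounding;            // snapped down to 3584 = 28*128
        std::printf("row_low = %lld\n", (long long) row_low);
        return 0;
    }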
@@ -4983,14 +5662,16 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
 
         int64_t row_low, row_high;
         if (split) {
+            const int64_t rounding = get_row_rounding(src0->type);
+
             row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
-            row_low -= row_low %
+            row_low -= row_low % rounding;
 
             if (id == g_device_count - 1) {
                 row_high = nrows0;
             } else {
                 row_high = nrows0*g_tensor_split[id + 1];
-                row_high -= row_high %
+                row_high -= row_high % rounding;
             }
         } else {
             row_low = 0;
@@ -5203,7 +5884,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     if (split && g_device_count > 1) {
         CUDA_CHECK(cudaSetDevice(g_main_device));
         for (int id = 0; id < g_device_count; ++id) {
-            if (id != g_main_device) {
+            if (id != g_main_device && src0_extra->events[id]) {
                 CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
             }
         }
@@ -5347,7 +6028,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     } else {
         int min_compute_capability = INT_MAX;
         for (int id = 0; id < g_device_count; ++id) {
-            if (min_compute_capability > g_compute_capabilities[id]
+            if (min_compute_capability > g_compute_capabilities[id]
+                    && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
                 min_compute_capability = g_compute_capabilities[id];
             }
         }
@@ -5468,14 +6150,16 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
             row_low = 0;
             row_high = nrows;
         } else if (backend == GGML_BACKEND_GPU_SPLIT) {
+            const int64_t rounding = get_row_rounding(tensor->type);
+
             row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
-            row_low -= row_low %
+            row_low -= row_low % rounding;
 
             if (id == g_device_count - 1) {
                 row_high = nrows;
             } else {
                 row_high = nrows*g_tensor_split[id + 1];
-                row_high -= row_high %
+                row_high -= row_high % rounding;
             }
         } else {
             GGML_ASSERT(false);
@@ -5785,3 +6469,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     func(tensor->src[0], tensor->src[1], tensor);
     return true;
 }
+
+int ggml_cuda_get_device_count() {
+    int device_count;
+    CUDA_CHECK(cudaGetDeviceCount(&device_count));
+    return device_count;
+}
+
+void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+    snprintf(description, description_size, "%s", prop.name);
+}
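Note: the two functions appended at the end of the file expose basic device enumeration. Assuming they are declared in ggml-cuda.h (which also changes in this release), a caller might use them as in the sketch below; this is a usage sketch, not code from the diff.

    #include <cstdio>
    #include "ggml-cuda.h"

    int main() {
        const int n = ggml_cuda_get_device_count();
        for (int i = 0; i < n; ++i) {
            char desc[256];
            ggml_cuda_get_device_description(i, desc, sizeof(desc));
            std::printf("CUDA device %d: %s\n", i, desc);
        }
        return 0;
    }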