llama_cpp 0.3.6 → 0.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-alloc.c +8 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1165 -721
- data/ext/llama_cpp/src/ggml-metal.m +39 -18
- data/ext/llama_cpp/src/ggml.c +396 -150
- data/ext/llama_cpp/src/ggml.h +113 -32
- data/ext/llama_cpp/src/llama-util.h +41 -1
- data/ext/llama_cpp/src/llama.cpp +214 -146
- data/ext/llama_cpp/src/llama.h +18 -1
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
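The hunks below are from data/ext/llama_cpp/src/ggml-cuda.cu. Broadly, they replace the compile-time GGML_CUDA_MMQ_Y tile size with mmq_y/nwarps template parameters on the allocate_tiles_*/load_tiles_* helpers and add per-quantization vec_dot_*_q8_1_impl_mmvq/_mmq kernels for the K-quants, all built on the __dp4a byte-wise dot product. The following standalone sketch (not taken from the gem; the kernel and buffer names are made up) shows that core pattern: accumulate 4-way int8 dot products into an int, then apply the per-block scales.

// Illustrative sketch only: the __dp4a accumulation pattern used by the
// vec_dot_*_q8_1 kernels in this diff. d4/d8 stand in for per-block scales.
#include <cstdio>
#include <cuda_runtime.h>

// Dot product of n*4 int8 values packed into n ints, scaled by d4*d8.
__global__ void packed_int8_dot(const int * v, const int * u, int n, float d4, float d8, float * out) {
#if __CUDA_ARCH__ >= 610 // __dp4a needs compute capability 6.1 or newer
    int sumi = 0;
    for (int i = 0; i < n; ++i) {
        sumi = __dp4a(v[i], u[i], sumi); // 4 int8 multiply-adds per call
    }
    *out = d4 * d8 * sumi;
#else
    *out = 0.0f; // same fallback style as the kernels in the diff
#endif
}

int main() {
    // 8 int8 values per operand, packed as two ints: {1..4} and {5..8} against all-ones.
    const int h_v[2] = {0x04030201, 0x08070605};
    const int h_u[2] = {0x01010101, 0x01010101};
    int *d_v, *d_u; float *d_out, h_out;
    cudaMalloc(&d_v, sizeof(h_v)); cudaMalloc(&d_u, sizeof(h_u)); cudaMalloc(&d_out, sizeof(float));
    cudaMemcpy(d_v, h_v, sizeof(h_v), cudaMemcpyHostToDevice);
    cudaMemcpy(d_u, h_u, sizeof(h_u), cudaMemcpyHostToDevice);
    packed_int8_dot<<<1, 1>>>(d_v, d_u, 2, 0.5f, 2.0f, d_out);
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("%f\n", h_out); // (1+2+...+8) * 0.5 * 2.0 = 36
    cudaFree(d_v); cudaFree(d_u); cudaFree(d_out);
    return 0;
}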
@@ -14,6 +14,7 @@
 #include "ggml.h"
 
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define CC_TURING 700
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -262,10 +263,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
-#ifndef GGML_CUDA_MMQ_Y
-#define GGML_CUDA_MMQ_Y 64
-#endif // GGML_CUDA_MMQ_Y
-
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
@@ -285,6 +282,20 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
 };
 
+static int g_device_count = -1;
+static int g_main_device = 0;
+static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+static bool g_mul_mat_q = false;
+
+static void * g_scratch_buffer = nullptr;
+static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
+static size_t g_scratch_offset = 0;
+
+static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
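The globals added above (device count, main device, per-device compute capabilities, tensor split, scratch buffer, cuBLAS handles, streams) hold the per-process CUDA state. As a rough, hedged sketch of how such fields are typically populated with the CUDA runtime API (this is not the gem's code; the names and the capability encoding are illustrative):

// Hedged sketch: filling in device-count and compute-capability globals.
#include <cstdio>
#include <cuda_runtime.h>

#define MAX_DEVICES 16 // stand-in for GGML_CUDA_MAX_DEVICES

static int g_device_count = -1;
static int g_compute_capabilities[MAX_DEVICES];

static void init_device_info(void) {
    cudaGetDeviceCount(&g_device_count);
    if (g_device_count > MAX_DEVICES) {
        g_device_count = MAX_DEVICES;
    }
    for (int id = 0; id < g_device_count; ++id) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, id);
        // encoded as major*100 + minor*10 so it can be compared against thresholds like MIN_CC_DP4A = 610
        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
        printf("device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
    }
}

int main() {
    init_device_info();
    return 0;
}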
@@ -1383,8 +1394,10 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
         sumi = __dp4a(vi1, u[2*i+1], sumi);
     }
 
+    const float2 ds8f = __half22float2(ds8);
+
     // second part effectively subtracts 8 from each quant value
-    return d4 * (sumi *
+    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -1410,12 +1423,14 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     }
 
 #ifdef GGML_CUDA_F16
-    const
-    const float d4d8 =
-    const float m4s8 =
+    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
+    const float d4d8 = tmp.x;
+    const float m4s8 = tmp.y;
 #else
-    const
-    const
+    const float2 dm4f = __half22float2(dm4);
+    const float2 ds8f = __half22float2(ds8);
+    const float d4d8 = dm4f.x * ds8f.x;
+    const float m4s8 = dm4f.y * ds8f.y;
 #endif // GGML_CUDA_F16
 
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
@@ -1434,6 +1449,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;
 
+#pragma unroll
     for (int i = 0; i < vdr; ++i) {
         int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
         vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
@@ -1450,8 +1466,10 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
         sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
     }
 
+    const float2 ds8f = __half22float2(ds8);
+
     // second part effectively subtracts 16 from each quant value
-    return d5 * (sumi*
+    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -1466,6 +1484,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;
 
+#pragma unroll
     for (int i = 0; i < vdr; ++i) {
         int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
         vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
@@ -1483,12 +1502,14 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     }
 
 #ifdef GGML_CUDA_F16
-    const
-    const float d5d8 =
-    const float m5s8 =
+    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
+    const float d5d8 = tmp.x;
+    const float m5s8 = tmp.y;
 #else
-    const
-    const
+    const float2 dm5f = __half22float2(dm5);
+    const float2 ds8f = __half22float2(ds8);
+    const float d5d8 = dm5f.x * ds8f.x;
+    const float m5s8 = dm5f.y * ds8f.y;
 #endif // GGML_CUDA_F16
 
     // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
@@ -1503,17 +1524,18 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
 #define VDR_Q8_0_Q8_1_MMQ 8
 
 template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
-    const int * v, const int * u, const float & d8_0, const
+    const int * v, const int * u, const float & d8_0, const float & d8_1) {
 
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;
 
+#pragma unroll
     for (int i = 0; i < vdr; ++i) {
         // SIMD dot product of quantized values
         sumi = __dp4a(v[i], u[i], sumi);
     }
 
-    return
+    return d8_0*d8_1 * sumi;
 #else
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -1525,18 +1547,21 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;
 
+#pragma unroll
     for (int i = 0; i < vdr; ++i) {
         // SIMD dot product of quantized values
         sumi = __dp4a(v[i], u[i], sumi);
     }
 
 #ifdef GGML_CUDA_F16
-    const
-    const float d8d8 =
-    const float m8s8 =
+    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
+    const float d8d8 = tmp.x;
+    const float m8s8 = tmp.y;
 #else
-    const
-    const
+    const float2 dm8f = __half22float2(dm8);
+    const float2 ds8f = __half22float2(ds8);
+    const float d8d8 = dm8f.x * ds8f.x;
+    const float m8s8 = dm8f.y * ds8f.y;
 #endif // GGML_CUDA_F16
 
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
@@ -1546,6 +1571,312 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
+#define VDR_Q2_K_Q8_1_MMVQ 1
+#define VDR_Q2_K_Q8_1_MMQ 2
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
+    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
+    const half2 & dm2, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR2_K; ++i) {
+        const int sc = scales[2*i];
+
+        const int vi = (v >> (2*i)) & 0x03030303;
+
+        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
+
+        // fill int with 4x m
+        int m = sc >> 4;
+        m |= m << 8;
+        m |= m << 16;
+        sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
+    }
+
+    const float2 dm2f = __half22float2(dm2);
+
+    return dm2f.x*sumf_d - dm2f.y*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
+    const half2 & dm2, const float & d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi_d = 0;
+    int sumi_m = 0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
+        int sumi_d_sc = 0;
+
+        const int sc = scales[i0 / (QI8_1/2)];
+
+        // fill int with 4x m
+        int m = sc >> 4;
+        m |= m << 8;
+        m |= m << 16;
+
+#pragma unroll
+        for (int i = i0; i < i0 + QI8_1/2; ++i) {
+            sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
+            sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m
+        }
+
+        sumi_d += sumi_d_sc * (sc & 0xF);
+    }
+
+    const float2 dm2f = __half22float2(dm2);
+
+    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q3_K_Q8_1_MMVQ 1
+#define VDR_Q3_K_Q8_1_MMQ 2
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
+    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
+    const int & scale_offset, const float & d3, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR3_K; ++i) {
+        const int isc = scale_offset + 2*i;
+
+        const int isc_low = isc % (QK_K/32);
+        const int sc_shift_low = 4 * (isc / (QK_K/32));
+        const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
+
+        const int isc_high = isc % (QK_K/64);
+        const int sc_shift_high = 2 * (isc / (QK_K/64));
+        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+        const int sc = (sc_low | sc_high) - 32;
+
+        const int vil = (vl >> (2*i)) & 0x03030303;
+
+        const int vih = ((vh >> i) << 2) & 0x04040404;
+
+        const int vi = __vsubss4(vil, vih);
+
+        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+    }
+
+    return d3 * sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
+    const float & d3, const float & d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
+        int sumi_sc = 0;
+
+        for (int i = i0; i < i0 + QI8_1/2; ++i) {
+            sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
+        }
+
+        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
+    }
+
+    return d3*d8 * sumi;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q4_K_Q8_1_MMVQ 2
+#define VDR_Q4_K_Q8_1_MMQ 8
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR4_K; ++i) {
+        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
+        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
+
+        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
+
+        sumf_d += d8[i] * (dot1 * sc[i]);
+        sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+// also used for q5_K
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) {
+            sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product
+            sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i0 / 4]);
+
+        sumf_d += ds8f.x * (sc[i0/4] * sumi_d);
+        sumf_m += ds8f.y * m[i0/4]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_K_Q8_1_MMVQ 2
+#define VDR_Q5_K_Q8_1_MMQ 8
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
+    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K; ++i) {
+        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
+        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
+
+        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
+        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
+
+        const int v0i = vl0i | vh0i;
+        const int v1i = vl1i | vh1i;
+
+        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
+
+        sumf_d += d8[i] * (dot1 * sc[i]);
+        sumf_m += d8[i] * (dot2 * m[i]);
+
+    }
+
+    const float2 dm5f = __half22float2(dm5);
+
+    return dm5f.x*sumf_d - dm5f.y*sumf_m;
+
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q6_K_Q8_1_MMVQ 1
+#define VDR_Q6_K_Q8_1_MMQ 8
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
+    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
+    const float & d, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR6_K; ++i) {
+        const int sc = scales[4*i];
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
+
+        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
+    const float & d6, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+
+#pragma unroll
+    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
+        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
+
+#pragma unroll
+        for (int i = i0; i < i0 + 2; ++i) {
+            sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
+            sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
+
+            sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
+            sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
+        }
+
+        sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
+    }
+
+    return d6 * sumf_d;
+
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
 static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 
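Each of the new *_impl functions above accumulates two partial sums, sumf_d and sumf_m, because a block-minimum quantization stores its values as d*q - m, so sum_i (d*q_i - m)*u_i = d*sum_i q_i*u_i - m*sum_i u_i. The (d, m) pair travels as one half2 (e.g. bq4_K->dm in this diff) and is split with __half22float2. A small hedged sketch of just that combination step (assumed names, not the gem's API):

// Hedged sketch: combining the packed (d, m) half2 with the two partial sums.
#include <cstdio>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

__global__ void dm_combine(const float d, const float m, const float sum_qu, const float sum_u, float * out) {
    const half2 dm = __floats2half2_rn(d, m); // packed like a block's dm field
    const float2 dmf = __half22float2(dm);    // dmf.x = d, dmf.y = m
    *out = dmf.x*sum_qu - dmf.y*sum_u;
}

int main() {
    float *d_out, h_out;
    cudaMalloc(&d_out, sizeof(float));
    // d = 0.25, m = 1.0, sum(q*u) = 40, sum(u) = 8  ->  0.25*40 - 1.0*8 = 2
    dm_combine<<<1, 1>>>(0.25f, 1.0f, 40.0f, 8.0f, d_out);
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("%f\n", h_out);
    cudaFree(d_out);
    return 0;
}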
@@ -1564,21 +1895,21 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
     return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
 }
 
-static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 
-    __shared__ int tile_x_qs[
-    __shared__ float tile_x_d[
+    __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
+    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
 
     *x_ql = tile_x_qs;
     *x_dm = (half2 *) tile_x_d;
 }
 
-template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset <
+    __builtin_assume(i_offset < nwarps);
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);
 
@@ -1590,7 +1921,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
     float * x_dmf = (float *) x_dm;
 
 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;
 
         if (need_check) {
@@ -1600,38 +1931,30 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
         const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
 
         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
+        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
     }
 
-
-
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
+    const int kbxd = k % blocks_per_tile_x_row;
 
-
-
-
-    // const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
+        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
 
-
-
-
+        if (need_check) {
+            i = min(i, i_max);
+        }
 
-
+        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
 
-
-
+        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
+    }
 }
 
 static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const float * x_dmf = (float *) x_dm;
 
@@ -1639,13 +1962,13 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
 
 #pragma unroll
     for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j *
-        u[2*l+1] = y_qs[j *
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
     }
 
     return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
         (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
-         y_ds[j * (
+         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
 }
 
 static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
@@ -1666,21 +1989,21 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
     return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
 }
 
-static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 
-    __shared__ int tile_x_qs[
-    __shared__ half2 tile_x_dm[
+    __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
 
     *x_ql = tile_x_qs;
     *x_dm = tile_x_dm;
 }
 
-template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset <
+    __builtin_assume(i_offset < nwarps);
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);
 
@@ -1690,7 +2013,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
     const block_q4_1 * bx0 = (block_q4_1 *) vx;
 
 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;
 
         if (need_check) {
@@ -1706,7 +2029,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
     const int kbxd = k % blocks_per_tile_x_row;
 
 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
         int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
 
         if (need_check) {
@@ -1723,26 +2046,19 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 
     int u[2*VDR_Q4_1_Q8_1_MMQ];
 
 #pragma unroll
     for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j *
-        u[2*l+1] = y_qs[j *
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
     }
 
     return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
         (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
-         y_ds[j * (
+         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
 }
 
 static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
@@ -1765,21 +2081,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
     return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
 }
 
-static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 
-    __shared__ int tile_x_ql[
-    __shared__ float tile_x_d[
+    __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
 
     *x_ql = tile_x_ql;
     *x_dm = (half2 *) tile_x_d;
 }
 
-template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset <
+    __builtin_assume(i_offset < nwarps);
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);
 
@@ -1789,7 +2105,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
     const block_q5_0 * bx0 = (block_q5_0 *) vx;
 
 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;
 
         if (need_check) {
@@ -1825,7 +2141,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
     float * x_dmf = (float *) x_dm;
 
 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
         int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
 
         if (need_check) {
@@ -1842,27 +2158,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
-    const float * x_dmf = (float *) x_dm;
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df = (const float *) y_ds;
 
     int u[2*VDR_Q5_0_Q8_1_MMQ];
 
 #pragma unroll
     for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j *
-        u[2*l+1] = y_qs[j *
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
     }
 
     return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx],
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
 }
 
 static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
@@ -1885,21 +2195,21 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
     return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
 }
 
-static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 
-    __shared__ int tile_x_ql[
-    __shared__ half2 tile_x_dm[
+    __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
 
     *x_ql = tile_x_ql;
     *x_dm = tile_x_dm;
 }
 
-template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset <
+    __builtin_assume(i_offset < nwarps);
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);
 
@@ -1909,7 +2219,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
     const block_q5_1 * bx0 = (block_q5_1 *) vx;
 
 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;
 
         if (need_check) {
@@ -1942,7 +2252,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
     const int kbxd = k % blocks_per_tile_x_row;
 
 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
         int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
 
         if (need_check) {
@@ -1959,13 +2269,6 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
-    __builtin_assume(i >= 0);
-    __builtin_assume(i < GGML_CUDA_MMQ_Y);
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
 
@@ -1973,12 +2276,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
 
 #pragma unroll
     for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j *
-        u[2*l+1] = y_qs[j *
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
     }
 
     return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
 }
 
 static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
@@ -1989,29 +2292,30 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
     int v[VDR_Q8_0_Q8_1_MMVQ];
     int u[VDR_Q8_0_Q8_1_MMVQ];
 
+#pragma unroll
     for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
         v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
         u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
     }
 
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds);
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
 }
 
-static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 
-    __shared__ int tile_x_qs[
-    __shared__ float tile_x_d[
+    __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
+    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
 
     *x_ql = tile_x_qs;
     *x_dm = (half2 *) tile_x_d;
 }
 
-template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset <
+    __builtin_assume(i_offset < nwarps);
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);
 
@@ -2022,7 +2326,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
     const block_q8_0 * bx0 = (block_q8_0 *) vx;
 
 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;
 
         if (need_check) {
@@ -2032,76 +2336,35 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q8_
         const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
 
         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
-        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d;
     }
 
-
-
+    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
+    const int kbxd = k % blocks_per_tile_x_row;
 
-
-
-
-    // const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
+        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
 
-
-
-
-    // }
-    // #endif // GGML_CUDA_MMQ_Y < 64
+        if (need_check) {
+            i = min(i, i_max);
+        }
 
-
+        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
 
-
-
+        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
+    }
 }
 
 static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
-
-
-    __builtin_assume(j >= 0);
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-
-    const float * x_dmf = (float *) x_dm;
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df = (const float *) y_ds;
 
     return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
         (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
-
-}
-
-#define VDR_q2_K_q8_1 1
-
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl(
-    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
-    const half2 & dm, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    for (int i = 0; i < QR2_K; ++i) {
-        const int sc = scales[2*i];
-
-        const int vi = (v >> (2*i)) & 0x03030303;
-
-        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
-
-        int sc_high = sc >> 4;
-        sc_high |= sc_high << 8;
-        sc_high |= sc_high << 16;
-        sumf_m += d8[i] * __dp4a(sc_high, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
-    }
-
-    const float2 dmf = __half22float2(dm);
-
-    return dmf.x*sumf_d - dmf.y*sumf_m;
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
 }
 
 static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
@@ -2115,34 +2378,35 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
     const uint8_t * scales = bq2_K->scales + scale_offset;
 
     const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
-    int
+    int u[QR2_K];
     float d8[QR2_K];
 
+#pragma unroll
     for (int i = 0; i < QR2_K; ++ i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
         d8[i] = bq8_1[bq8_offset + i].ds.x;
     }
 
-    return
+    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
 }
 
-static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 
-    __shared__ int tile_x_ql[
-    __shared__ half2 tile_x_dm[
-    __shared__ int tile_x_sc[
+    __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
+    __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
 
     *x_ql = tile_x_ql;
     *x_dm = tile_x_dm;
     *x_sc = tile_x_sc;
 }
 
-template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset <
+    __builtin_assume(i_offset < nwarps);
     __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);
 
@@ -2152,7 +2416,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
     const block_q2_K * bx0 = (block_q2_K *) vx;
 
 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;
 
         if (need_check) {
@@ -2168,8 +2432,8 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
     const int kbxd = k % blocks_per_tile_x_row;
 
 #pragma unroll
-    for (int i0 = 0; i0 <
-        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) %
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
+        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
 
         if (need_check) {
             i = min(i, i_max);
@@ -2181,7 +2445,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q2_
     }
 
 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
         int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
 
         if (need_check) {
@@ -2198,68 +2462,24 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
-
-
-
-    __builtin_assume(j < WARP_SIZE);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
-
-    const int kbx = k / QI2_K;
-    const int kqsx = k % QI2_K;
+    const int kbx = k / QI2_K;
+    const int ky = (k % QI2_K) * QR2_K;
+    const float * y_df = (const float *) y_ds;
 
-
-    const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
+    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
 
-    const
+    const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
+    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
 
-
-
-
-    for (int l = 0; l < QR2_K; ++ l) {
-        const int y_qs_index = j * (QR2_K*WARP_SIZE) + kbx * (QR2_K*QI2_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
-        u[l] = y_qs[y_qs_index];
-        d8[l] = y_ds[y_qs_index / QI8_1].x;
+#pragma unroll
+    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
+        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
     }
 
-
-}
-
-#define VDR_q3_K_q8_1 1
-
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl(
-    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
-    const int & scale_offset, const float & d, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf = 0.0f;
-
-    for (int i = 0; i < QR3_K; ++i) {
-        const int isc = scale_offset + 2*i;
+    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
 
-
-
-        const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;
-
-        const int isc_high = isc % (QK_K/64);
-        const int sc_shift_high = 2 * (isc / (QK_K/64));
-        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
-
-        const int sc = (sc_low | sc_high) - 32;
-
-        const int vil = (vl >> (2*i)) & 0x03030303;
-
-        const int vih = ((vh >> i) << 2) & 0x04040404;
-
-        const int vi = __vsubss4(vil, vih);
-
-        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
-    }
-
-    return d*sumf;
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
+    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
 }
 
 static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
@@ -2277,23 +2497,24 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
     // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
     const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
 
-    int
+    int u[QR3_K];
     float d8[QR3_K];
 
+#pragma unroll
     for (int i = 0; i < QR3_K; ++i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
         d8[i] = bq8_1[bq8_offset + i].ds.x;
     }
 
-    return
+    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
 }
 
-static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
 
-    __shared__ int tile_x_ql[
-    __shared__ half2 tile_x_dm[
-    __shared__ int tile_x_qh[
-    __shared__ int tile_x_sc[
+    __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
+    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
+    __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2];
+    __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4];
 
     *x_ql = tile_x_ql;
     *x_dm = tile_x_dm;
@@ -2301,12 +2522,12 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 **
     *x_sc = tile_x_sc;
 }
 
-template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
 
     __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset <
+    __builtin_assume(i_offset < nwarps);
    __builtin_assume(k >= 0);
     __builtin_assume(k < WARP_SIZE);
 
@@ -2316,7 +2537,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
     const block_q3_K * bx0 = (block_q3_K *) vx;
 
 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + i_offset;
 
         if (need_check) {
@@ -2330,10 +2551,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
 
     const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
     const int kbxd = k % blocks_per_tile_x_row;
+    float * x_dmf = (float *) x_dm;
 
 #pragma unroll
-    for (int i0 = 0; i0 <
-        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) %
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
+        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
 
         if (need_check) {
             i = min(i, i_max);
@@ -2341,11 +2563,11 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
 
         const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
 
-
+        x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
     }
 
 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
         int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
 
         if (need_check) {
@@ -2354,11 +2576,12 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
 
         const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
 
-
+        // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
+        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
     }
 
 #pragma unroll
-    for (int i0 = 0; i0 <
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
         int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
 
         if (need_check) {
@@ -2367,7 +2590,19 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q3_
 
         const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
 
-
+        const int ksc = k % (QI3_K/4);
+
+        const int ksc_low = ksc % (QI3_K/8);
+        const int shift_low = 4 * (ksc / (QI3_K/8));
+        const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
+
+        const int ksc_high = QI3_K/8;
+        const int shift_high = 2 * ksc;
+        const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
+
+        const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
+
+        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
     }
 }
 
|
@@ -2375,63 +2610,29 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
|
|
2375
2610
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2376
2611
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2377
2612
|
|
2378
|
-
__builtin_assume(i >= 0);
|
2379
|
-
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2380
|
-
__builtin_assume(j >= 0);
|
2381
|
-
__builtin_assume(j < WARP_SIZE);
|
2382
|
-
__builtin_assume(k >= 0);
|
2383
|
-
__builtin_assume(k < WARP_SIZE);
|
2384
|
-
|
2385
2613
|
const int kbx = k / QI3_K;
|
2386
|
-
const int
|
2387
|
-
|
2388
|
-
const
|
2389
|
-
const int scale_offset = kqsx - kqsx % QI8_1 + (kqsx % QI8_1) / (QI8_1/2);
|
2390
|
-
|
2391
|
-
const uint8_t * scales = ((uint8_t *) (x_sc + i * (WARP_SIZE/4) + i / 4)) + kbx*16;
|
2614
|
+
const int ky = (k % QI3_K) * QR3_K;
|
2615
|
+
const float * x_dmf = (const float *) x_dm;
|
2616
|
+
const float * y_df = (const float *) y_ds;
|
2392
2617
|
|
2393
|
-
|
2394
|
-
const int vh = ~x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + kqsx % (QI3_K/2)] >> bq8_offset;
|
2618
|
+
const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
|
2395
2619
|
|
2396
|
-
int
|
2397
|
-
float d8[QR3_K];
|
2398
|
-
|
2399
|
-
for (int l = 0; l < QR3_K; ++ l) {
|
2400
|
-
const int y_qs_index = j * (QR3_K*WARP_SIZE) + kbx * (QR3_K*QI3_K) + (bq8_offset + l)*QI8_1 + kqsx % QI8_1;
|
2401
|
-
u[l] = y_qs[y_qs_index];
|
2402
|
-
d8[l] = y_ds[y_qs_index / QI8_1].x;
|
2403
|
-
}
|
2404
|
-
|
2405
|
-
return vec_dot_q3_K_q8_1_impl(x_ql[i * (WARP_SIZE + 1) + k], vh, u, scales, scale_offset,
|
2406
|
-
x_dm[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx].x, d8);
|
2407
|
-
}
|
2408
|
-
|
2409
|
-
#define VDR_q4_K_q8_1 2
|
2410
|
-
|
2411
|
-
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl(
|
2412
|
-
const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
2413
|
-
const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
|
2414
|
-
|
2415
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2416
|
-
float sumf_d = 0.0f;
|
2417
|
-
float sumf_m = 0.0f;
|
2620
|
+
int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
|
2418
2621
|
|
2419
|
-
|
2420
|
-
|
2421
|
-
const int
|
2622
|
+
#pragma unroll
|
2623
|
+
for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
|
2624
|
+
const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
|
2625
|
+
const int shift = 2 * ((ky % 32) / 8);
|
2626
|
+
const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
|
2422
2627
|
|
2423
|
-
const int
|
2424
|
-
const int
|
2628
|
+
const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
|
2629
|
+
const int vlh = (vh << 2) & 0x04040404;
|
2425
2630
|
|
2426
|
-
|
2427
|
-
sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
|
2631
|
+
v[l] = __vsubss4(vll, vlh);
|
2428
2632
|
}
|
2429
2633
|
|
2430
|
-
|
2431
|
-
|
2432
|
-
#else
|
2433
|
-
return 0.0f; // only to satisfy the compiler
|
2434
|
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2634
|
+
const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
|
2635
|
+
return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
|
2435
2636
|
}
|
2436
2637
|
|
2437
2638
|
static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
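The rewritten mul_mat vec-dot for q3_K above rebuilds each quant as its 2-bit value minus 4 whenever the high-mask bit is clear; because the loader stored `~hmask`, that subtraction becomes `(vh << 2) & 0x04040404` applied per byte with `__vsubss4`. A small host-side check of the identity (illustrative only, not from the diff):

```cpp
#include <cassert>
#include <cstdio>

int main() {
    for (int low2 = 0; low2 < 4; ++low2) {
        for (int hbit = 0; hbit < 2; ++hbit) {
            const int reference = low2 - (hbit ? 0 : 4);   // ggml's q3_K decode rule per value
            const int inverted  = (~hbit) & 1;             // what the tile stores after ~hmask
            const int trick     = low2 - (inverted << 2);  // the (vh << 2) & 0x04 subtraction, one byte lane
            assert(reference == trick);
            printf("low2=%d hmask_bit=%d -> %d\n", low2, hbit, reference);
        }
    }
    return 0;
}
```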
@@ -2478,7 +2679,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|
2478
2679
|
u[2*i+1] = q8[4];
|
2479
2680
|
}
|
2480
2681
|
|
2481
|
-
return
|
2682
|
+
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
2482
2683
|
|
2483
2684
|
#else
|
2484
2685
|
|
@@ -2527,23 +2728,23 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|
2527
2728
|
#endif
|
2528
2729
|
}
|
2529
2730
|
|
2530
|
-
static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2731
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2531
2732
|
|
2532
|
-
__shared__ int tile_x_ql[
|
2533
|
-
__shared__ half2 tile_x_dm[
|
2534
|
-
__shared__ int tile_x_sc[
|
2733
|
+
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
|
2734
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
|
2735
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
|
2535
2736
|
|
2536
2737
|
*x_ql = tile_x_ql;
|
2537
2738
|
*x_dm = tile_x_dm;
|
2538
2739
|
*x_sc = tile_x_sc;
|
2539
2740
|
}
|
2540
2741
|
|
2541
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
|
2742
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
|
2542
2743
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2543
2744
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2544
2745
|
|
2545
2746
|
__builtin_assume(i_offset >= 0);
|
2546
|
-
__builtin_assume(i_offset <
|
2747
|
+
__builtin_assume(i_offset < nwarps);
|
2547
2748
|
__builtin_assume(k >= 0);
|
2548
2749
|
__builtin_assume(k < WARP_SIZE);
|
2549
2750
|
|
@@ -2553,7 +2754,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
2553
2754
|
const block_q4_K * bx0 = (block_q4_K *) vx;
|
2554
2755
|
|
2555
2756
|
#pragma unroll
|
2556
|
-
for (int i0 = 0; i0 <
|
2757
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2557
2758
|
int i = i0 + i_offset;
|
2558
2759
|
|
2559
2760
|
if (need_check) {
|
@@ -2563,118 +2764,62 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q4_
|
|
2563
2764
|
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
|
2564
2765
|
|
2565
2766
|
x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
2566
|
-
}
|
2567
|
-
|
2568
|
-
const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
|
2569
|
-
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
2570
|
-
|
2571
|
-
#pragma unroll
|
2572
|
-
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) {
|
2573
|
-
int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
|
2574
|
-
|
2575
|
-
if (need_check) {
|
2576
|
-
i = min(i, i_max);
|
2577
|
-
}
|
2578
|
-
|
2579
|
-
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2580
|
-
|
2581
|
-
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
2582
|
-
}
|
2583
|
-
|
2584
|
-
#pragma unroll
|
2585
|
-
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
|
2586
|
-
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
|
2587
|
-
|
2588
|
-
if (need_check) {
|
2589
|
-
i = min(i, i_max);
|
2590
|
-
}
|
2591
|
-
|
2592
|
-
const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
|
2593
|
-
|
2594
|
-
x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_uint8_aligned(bxi->scales, k % (QI4_K/8));
|
2595
|
-
}
|
2596
|
-
}
|
2597
|
-
|
2598
|
-
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
|
2599
|
-
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2600
|
-
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2601
|
-
|
2602
|
-
__builtin_assume(i >= 0);
|
2603
|
-
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2604
|
-
__builtin_assume(j >= 0);
|
2605
|
-
__builtin_assume(j < WARP_SIZE);
|
2606
|
-
__builtin_assume(k >= 0);
|
2607
|
-
__builtin_assume(k < WARP_SIZE);
|
2608
|
-
|
2609
|
-
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
2610
|
-
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
2611
|
-
|
2612
|
-
int v[2];
|
2613
|
-
int u[2*QR4_K];
|
2614
|
-
float d8[QR4_K];
|
2767
|
+
}
|
2615
2768
|
|
2616
|
-
|
2617
|
-
const int
|
2769
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
|
2770
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
2618
2771
|
|
2619
|
-
|
2620
|
-
|
2772
|
+
#pragma unroll
|
2773
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
|
2774
|
+
int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
|
2621
2775
|
|
2622
|
-
|
2623
|
-
|
2624
|
-
|
2625
|
-
|
2626
|
-
|
2627
|
-
aux[1] = scales[l+2] & 0x3f3f;
|
2628
|
-
} else {
|
2629
|
-
aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
|
2630
|
-
aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
|
2631
|
-
}
|
2632
|
-
const uint8_t * sc = (const uint8_t *)aux;
|
2633
|
-
const uint8_t * m = sc + 2;
|
2776
|
+
if (need_check) {
|
2777
|
+
i = min(i, i_max);
|
2778
|
+
}
|
2779
|
+
|
2780
|
+
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2634
2781
|
|
2635
|
-
|
2636
|
-
const int kqsy = j * (QR4_K*WARP_SIZE) + kbx * (QR4_K*QI4_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
|
2637
|
-
u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
|
2638
|
-
u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
|
2639
|
-
d8[l] = y_ds[kqsy / QI8_1].x;
|
2782
|
+
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
2640
2783
|
}
|
2641
2784
|
|
2642
|
-
|
2643
|
-
|
2785
|
+
#pragma unroll
|
2786
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
|
2787
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
|
2644
2788
|
|
2645
|
-
|
2789
|
+
if (need_check) {
|
2790
|
+
i = min(i, i_max);
|
2791
|
+
}
|
2646
2792
|
|
2647
|
-
|
2648
|
-
const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
|
2649
|
-
const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
|
2793
|
+
const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
|
2650
2794
|
|
2651
|
-
|
2652
|
-
float sumf_d = 0.0f;
|
2653
|
-
float sumf_m = 0.0f;
|
2795
|
+
const int * scales = (int *) bxi->scales;
|
2654
2796
|
|
2655
|
-
|
2656
|
-
const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
|
2657
|
-
const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
|
2797
|
+
const int ksc = k % (WARP_SIZE/8);
|
2658
2798
|
|
2659
|
-
|
2660
|
-
|
2799
|
+
// scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
|
2800
|
+
int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
|
2801
|
+
scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
|
2661
2802
|
|
2662
|
-
|
2663
|
-
|
2803
|
+
x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
|
2804
|
+
}
|
2805
|
+
}
|
2664
2806
|
|
2665
|
-
|
2666
|
-
|
2807
|
+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
|
2808
|
+
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2809
|
+
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2667
2810
|
|
2668
|
-
|
2669
|
-
sumf_m += d8[i] * (dot2 * m[i]);
|
2811
|
+
int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
|
2670
2812
|
|
2813
|
+
#pragma unroll
|
2814
|
+
for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
|
2815
|
+
v[l + 0] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
|
2816
|
+
v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
|
2671
2817
|
}
|
2672
2818
|
|
2673
|
-
|
2819
|
+
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
|
2674
2820
|
|
2675
|
-
|
2676
|
-
return
|
2677
|
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
2821
|
+
const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
|
2822
|
+
return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
|
2678
2823
|
}
|
2679
2824
|
|
2680
2825
|
static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
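All of the load_tiles_* helpers above share the same out-of-bounds guard: when `need_check` is true, the tile-local row is clamped to `i_max` (the kernel passes `nrows_x-row_x_0-1`), so the last, partially filled tile re-reads the final matrix row instead of running past the buffer; when the row count is a multiple of mmq_y the check is compiled out via the template parameter. A hedged host-side sketch of that clamp with made-up sizes:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    const int mmq_y   = 64;                      // rows per tile (illustrative value)
    const int nrows_x = 100;                     // 100 % 64 != 0, so the second tile overhangs the matrix
    const int row_x_0 = 64;                      // first row covered by the second tile
    const int i_max   = nrows_x - row_x_0 - 1;   // same expression the kernel passes to load_tiles

    for (int i0 = 0; i0 < mmq_y; i0 += 16) {     // the stride stands in for the nwarps-strided loop
        int i = i0;                              // tile-local row this step wants to load
        i = std::min(i, i_max);                  // need_check == true: clamp instead of reading out of bounds
        printf("tile row %2d -> matrix row %d\n", i0, row_x_0 + i);
    }
    return 0;
}
```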
@@ -2711,6 +2856,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|
2711
2856
|
const uint8_t * sc = (const uint8_t *)aux;
|
2712
2857
|
const uint8_t * m = sc + 2;
|
2713
2858
|
|
2859
|
+
#pragma unroll
|
2714
2860
|
for (int i = 0; i < QR5_K; ++i) {
|
2715
2861
|
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
2716
2862
|
d8[i] = bq8i->ds.x;
|
@@ -2765,25 +2911,23 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|
2765
2911
|
#endif
|
2766
2912
|
}
|
2767
2913
|
|
2768
|
-
static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2914
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2769
2915
|
|
2770
|
-
__shared__ int tile_x_ql[
|
2771
|
-
__shared__ half2 tile_x_dm[
|
2772
|
-
__shared__ int
|
2773
|
-
__shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
|
2916
|
+
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
2917
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
|
2918
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
|
2774
2919
|
|
2775
2920
|
*x_ql = tile_x_ql;
|
2776
2921
|
*x_dm = tile_x_dm;
|
2777
|
-
*x_qh = tile_x_qh;
|
2778
2922
|
*x_sc = tile_x_sc;
|
2779
2923
|
}
|
2780
2924
|
|
2781
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
|
2925
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
|
2782
2926
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2783
2927
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2784
2928
|
|
2785
2929
|
__builtin_assume(i_offset >= 0);
|
2786
|
-
__builtin_assume(i_offset <
|
2930
|
+
__builtin_assume(i_offset < nwarps);
|
2787
2931
|
__builtin_assume(k >= 0);
|
2788
2932
|
__builtin_assume(k < WARP_SIZE);
|
2789
2933
|
|
@@ -2793,7 +2937,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
|
|
2793
2937
|
const block_q5_K * bx0 = (block_q5_K *) vx;
|
2794
2938
|
|
2795
2939
|
#pragma unroll
|
2796
|
-
for (int i0 = 0; i0 <
|
2940
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2797
2941
|
int i = i0 + i_offset;
|
2798
2942
|
|
2799
2943
|
if (need_check) {
|
@@ -2801,16 +2945,29 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
|
|
2801
2945
|
}
|
2802
2946
|
|
2803
2947
|
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
|
2948
|
+
const int ky = QR5_K*kqsx;
|
2804
2949
|
|
2805
|
-
|
2950
|
+
const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
|
2951
|
+
const int ql0 = (ql >> 0) & 0x0F0F0F0F;
|
2952
|
+
const int ql1 = (ql >> 4) & 0x0F0F0F0F;
|
2953
|
+
|
2954
|
+
const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
|
2955
|
+
const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
|
2956
|
+
const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
|
2957
|
+
|
2958
|
+
const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
|
2959
|
+
const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
|
2960
|
+
|
2961
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
|
2962
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
|
2806
2963
|
}
|
2807
2964
|
|
2808
2965
|
const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
|
2809
|
-
const int kbxd = k % blocks_per_tile_x_row;
|
2966
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
2810
2967
|
|
2811
2968
|
#pragma unroll
|
2812
|
-
for (int i0 = 0; i0 <
|
2813
|
-
int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) %
|
2969
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
|
2970
|
+
int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
|
2814
2971
|
|
2815
2972
|
if (need_check) {
|
2816
2973
|
i = min(i, i_max);
|
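For q5_K the loader above widens the tile: the 4-bit low nibble and the single high bit of each quant are merged into one 5-bit value per byte (`ql0 | qh0`, with the high bit masked into bit 4 via 0x10101010). A small host-side illustration with invented sample data:

```cpp
#include <cstdio>

int main() {
    const unsigned ql      = 0x0F020A05;   // low nibbles of four quants: 5, 10, 2, 15
    const unsigned qh_bits = 0x00000009;   // high bits for the same four lanes: 1, 0, 0, 1

    unsigned merged = 0;
    for (int lane = 0; lane < 4; ++lane) {
        const unsigned lo = (ql >> (8*lane)) & 0x0F;    // 4-bit part
        const unsigned hi = (qh_bits >> lane) & 0x01;   // 1-bit part
        merged |= ((hi << 4) | lo) << (8*lane);         // 5-bit value in its own byte
    }
    // byte-wise this is exactly (ql & 0x0F0F0F0F) | ((spread-out qh << 4) & 0x10101010)
    for (int lane = 0; lane < 4; ++lane) {
        printf("quant %d = %u\n", lane, (merged >> (8*lane)) & 0x1F);  // prints 21, 10, 2, 31
    }
    return 0;
}
```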
@@ -2822,29 +2979,24 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q5_
|
|
2822
2979
|
}
|
2823
2980
|
|
2824
2981
|
#pragma unroll
|
2825
|
-
for (int i0 = 0; i0 <
|
2826
|
-
int i = i0 + i_offset *
|
2982
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
|
2983
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
|
2827
2984
|
|
2828
2985
|
if (need_check) {
|
2829
2986
|
i = min(i, i_max);
|
2830
2987
|
}
|
2831
2988
|
|
2832
|
-
const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/
|
2833
|
-
|
2834
|
-
x_qh[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8(bxi->qh, k % (QI5_K/4));
|
2835
|
-
}
|
2989
|
+
const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
|
2836
2990
|
|
2837
|
-
|
2838
|
-
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) {
|
2839
|
-
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y;
|
2991
|
+
const int * scales = (int *) bxi->scales;
|
2840
2992
|
|
2841
|
-
|
2842
|
-
i = min(i, i_max);
|
2843
|
-
}
|
2993
|
+
const int ksc = k % (WARP_SIZE/8);
|
2844
2994
|
|
2845
|
-
|
2995
|
+
// scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
|
2996
|
+
int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
|
2997
|
+
scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits
|
2846
2998
|
|
2847
|
-
x_sc[i * (WARP_SIZE/8) + i / 8 +
|
2999
|
+
x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
|
2848
3000
|
}
|
2849
3001
|
}
|
2850
3002
|
|
@@ -2852,77 +3004,11 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
|
|
2852
3004
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
2853
3005
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
2854
3006
|
|
2855
|
-
|
2856
|
-
__builtin_assume(i < GGML_CUDA_MMQ_Y);
|
2857
|
-
__builtin_assume(j >= 0);
|
2858
|
-
__builtin_assume(j < WARP_SIZE);
|
2859
|
-
__builtin_assume(k >= 0);
|
2860
|
-
__builtin_assume(k < WARP_SIZE);
|
2861
|
-
|
2862
|
-
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
2863
|
-
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
2864
|
-
|
2865
|
-
int vl[2];
|
2866
|
-
int vh[2];
|
2867
|
-
int u[2*QR4_K];
|
2868
|
-
float d8[QR4_K];
|
2869
|
-
|
2870
|
-
const int bq8_offset = QR5_K * ((kqsx/2) / (QI8_1/2));
|
2871
|
-
|
2872
|
-
vl[0] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 0];
|
2873
|
-
vl[1] = x_ql[i * (WARP_SIZE + 1) + 4 * bq8_offset + (kqsx/2) % 4 + 4];
|
2874
|
-
|
2875
|
-
vh[0] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 0] >> bq8_offset;
|
2876
|
-
vh[1] = x_qh[i * (WARP_SIZE/4) + i/4 + (kqsx/2) % 4 + 4] >> bq8_offset;
|
2877
|
-
|
2878
|
-
const uint16_t * scales = (const uint16_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + kbx * 4];
|
2879
|
-
uint16_t aux[2];
|
2880
|
-
const int l = bq8_offset/2;
|
2881
|
-
if (l < 2) {
|
2882
|
-
aux[0] = scales[l+0] & 0x3f3f;
|
2883
|
-
aux[1] = scales[l+2] & 0x3f3f;
|
2884
|
-
} else {
|
2885
|
-
aux[0] = ((scales[l+2] >> 0) & 0x0f0f) | ((scales[l-2] & 0xc0c0) >> 2);
|
2886
|
-
aux[1] = ((scales[l+2] >> 4) & 0x0f0f) | ((scales[l-0] & 0xc0c0) >> 2);
|
2887
|
-
}
|
2888
|
-
const uint8_t * sc = (const uint8_t *)aux;
|
2889
|
-
const uint8_t * m = sc + 2;
|
2890
|
-
|
2891
|
-
for (int l = 0; l < QR5_K; ++l) {
|
2892
|
-
const int kqsy = j * (QR5_K*WARP_SIZE) + kbx * (QR5_K*QI5_K) + (bq8_offset + l) * QI8_1 + (kqsx/2) % (QI8_1/2);
|
2893
|
-
u[2*l+0] = y_qs[kqsy + 0*(QI8_1/2)];
|
2894
|
-
u[2*l+1] = y_qs[kqsy + 1*(QI8_1/2)];
|
2895
|
-
d8[l] = y_ds[kqsy / QI8_1].x;
|
2896
|
-
}
|
2897
|
-
|
2898
|
-
return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K + kbx], d8);
|
2899
|
-
}
|
2900
|
-
|
2901
|
-
#define VDR_q6_K_q8_1 1
|
2902
|
-
|
2903
|
-
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl(
|
2904
|
-
const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
|
2905
|
-
const float & d, const float * __restrict__ d8) {
|
2906
|
-
|
2907
|
-
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
2908
|
-
float sumf = 0.0f;
|
2909
|
-
|
2910
|
-
for (int i = 0; i < QR6_K; ++i) {
|
2911
|
-
const int sc = scales[4*i];
|
2912
|
-
|
2913
|
-
const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
|
2914
|
-
|
2915
|
-
const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
|
2916
|
-
|
2917
|
-
const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
|
2918
|
-
|
2919
|
-
sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
|
2920
|
-
}
|
3007
|
+
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
|
2921
3008
|
|
2922
|
-
|
2923
|
-
|
2924
|
-
return
|
2925
|
-
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
3009
|
+
const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
|
3010
|
+
const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
|
3011
|
+
return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
|
2926
3012
|
}
|
2927
3013
|
|
2928
3014
|
static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
@@ -2942,33 +3028,32 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
|
2942
3028
|
int u[QR6_K];
|
2943
3029
|
float d8[QR6_K];
|
2944
3030
|
|
3031
|
+
#pragma unroll
|
2945
3032
|
for (int i = 0; i < QR6_K; ++i) {
|
2946
3033
|
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
|
2947
3034
|
d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
|
2948
3035
|
}
|
2949
3036
|
|
2950
|
-
return
|
3037
|
+
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
|
2951
3038
|
}
|
2952
3039
|
|
2953
|
-
static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
3040
|
+
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
2954
3041
|
|
2955
|
-
__shared__ int tile_x_ql[
|
2956
|
-
__shared__ half2 tile_x_dm[
|
2957
|
-
__shared__ int
|
2958
|
-
__shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8];
|
3042
|
+
__shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
|
3043
|
+
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
|
3044
|
+
__shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8];
|
2959
3045
|
|
2960
3046
|
*x_ql = tile_x_ql;
|
2961
3047
|
*x_dm = tile_x_dm;
|
2962
|
-
*x_qh = tile_x_qh;
|
2963
3048
|
*x_sc = tile_x_sc;
|
2964
3049
|
}
|
2965
3050
|
|
2966
|
-
template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
|
3051
|
+
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
|
2967
3052
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
2968
3053
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
2969
3054
|
|
2970
3055
|
__builtin_assume(i_offset >= 0);
|
2971
|
-
__builtin_assume(i_offset <
|
3056
|
+
__builtin_assume(i_offset < nwarps);
|
2972
3057
|
__builtin_assume(k >= 0);
|
2973
3058
|
__builtin_assume(k < WARP_SIZE);
|
2974
3059
|
|
@@ -2978,7 +3063,7 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
|
|
2978
3063
|
const block_q6_K * bx0 = (block_q6_K *) vx;
|
2979
3064
|
|
2980
3065
|
#pragma unroll
|
2981
|
-
for (int i0 = 0; i0 <
|
3066
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
2982
3067
|
int i = i0 + i_offset;
|
2983
3068
|
|
2984
3069
|
if (need_check) {
|
@@ -2986,42 +3071,43 @@ template <bool need_check> static __device__ __forceinline__ void load_tiles_q6_
|
|
2986
3071
|
}
|
2987
3072
|
|
2988
3073
|
const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
|
3074
|
+
const int ky = QR6_K*kqsx;
|
2989
3075
|
|
2990
|
-
|
2991
|
-
|
2992
|
-
|
2993
|
-
const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
|
2994
|
-
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
2995
|
-
|
2996
|
-
#pragma unroll
|
2997
|
-
for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) {
|
2998
|
-
int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y;
|
3076
|
+
const int ql = get_int_from_uint8(bxi->ql, kqsx);
|
3077
|
+
const int ql0 = (ql >> 0) & 0x0F0F0F0F;
|
3078
|
+
const int ql1 = (ql >> 4) & 0x0F0F0F0F;
|
2999
3079
|
|
3000
|
-
|
3001
|
-
|
3002
|
-
|
3080
|
+
const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
|
3081
|
+
const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
|
3082
|
+
const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030;
|
3003
3083
|
|
3004
|
-
const
|
3084
|
+
const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
|
3085
|
+
const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
|
3005
3086
|
|
3006
|
-
|
3087
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
|
3088
|
+
x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
|
3007
3089
|
}
|
3008
3090
|
|
3091
|
+
const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
|
3092
|
+
const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
|
3093
|
+
float * x_dmf = (float *) x_dm;
|
3094
|
+
|
3009
3095
|
#pragma unroll
|
3010
|
-
for (int i0 = 0; i0 <
|
3011
|
-
int i = i0 + i_offset *
|
3096
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
|
3097
|
+
int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
|
3012
3098
|
|
3013
3099
|
if (need_check) {
|
3014
3100
|
i = min(i, i_max);
|
3015
3101
|
}
|
3016
3102
|
|
3017
|
-
const block_q6_K * bxi = bx0 + i*blocks_per_row +
|
3103
|
+
const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
3018
3104
|
|
3019
|
-
|
3105
|
+
x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
|
3020
3106
|
}
|
3021
3107
|
|
3022
3108
|
#pragma unroll
|
3023
|
-
for (int i0 = 0; i0 <
|
3024
|
-
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) %
|
3109
|
+
for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
|
3110
|
+
int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
|
3025
3111
|
|
3026
3112
|
if (need_check) {
|
3027
3113
|
i = min(i, i_max);
|
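The q6_K loader above likewise stores fully reconstructed values, writing the two halves derived from one `ql` word `QI6_K/2` columns apart in the doubled tile row (2*WARP_SIZE ints wide). The `kq0`/`kq1` index arithmetic can be replayed on the host; the values below are just the formula evaluated for a few k:

```cpp
#include <cstdio>

int main() {
    const int QI6_K = 32, QR6_K = 2;                 // constants used by this file for QK_K == 256
    for (int k = 0; k < 32; k += 8) {
        const int kqsx = k % QI6_K;
        const int ky   = QR6_K * kqsx;
        const int kq0  = ky - ky % QI6_K + k % (QI6_K/2);   // column for the low-nibble half
        const int kq1  = kq0 + (QI6_K/2);                   // column for the high-nibble half
        printf("k=%2d -> low-nibble half at column %2d, high-nibble half at column %2d\n", k, kq0, kq1);
    }
    return 0;
}
```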
@@ -3037,39 +3123,17 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
|
|
3037
3123
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
3038
3124
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
3039
3125
|
|
3040
|
-
|
3041
|
-
|
3042
|
-
__builtin_assume(j >= 0);
|
3043
|
-
__builtin_assume(j < WARP_SIZE);
|
3044
|
-
__builtin_assume(k >= 0);
|
3045
|
-
__builtin_assume(k < WARP_SIZE);
|
3046
|
-
|
3047
|
-
const int kbx = k / QI6_K; // == 0 if QK_K == 256
|
3048
|
-
const int kqsx = k % QI6_K; // == k if QK_K == 256
|
3049
|
-
|
3050
|
-
const int bq8_offset = 2 * QR6_K * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/4);
|
3051
|
-
const int scale_offset = (QI6_K/4) * (kqsx / (QI6_K/2)) + (kqsx % (QI6_K/2)) / (QI6_K/8);
|
3052
|
-
const int vh_shift = 2 * ((kqsx % (QI6_K/2)) / (QI6_K/4));
|
3053
|
-
|
3054
|
-
const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI6_K/2) + (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)] >> vh_shift;
|
3055
|
-
|
3056
|
-
const int x_sc_offset = i * (WARP_SIZE/8) + i/8 + kbx * (QI6_K/8);
|
3057
|
-
const int8_t * scales = ((int8_t *) (x_sc + x_sc_offset)) + scale_offset;
|
3126
|
+
const float * x_dmf = (const float *) x_dm;
|
3127
|
+
const float * y_df = (const float *) y_ds;
|
3058
3128
|
|
3059
|
-
|
3060
|
-
float d8[QR6_K];
|
3061
|
-
|
3062
|
-
for (int l = 0; l < QR6_K; ++l) {
|
3063
|
-
const int kqsy = j * (QR6_K*WARP_SIZE) + kbx * (QR6_K*QI6_K) + (bq8_offset + 2*l)*QI8_1 + kqsx % QI8_1;
|
3064
|
-
u[l] = y_qs[kqsy];
|
3065
|
-
d8[l] = y_ds[kqsy / QI8_1].x;
|
3066
|
-
}
|
3129
|
+
const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
|
3067
3130
|
|
3068
|
-
|
3069
|
-
|
3131
|
+
const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k;
|
3132
|
+
const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE;
|
3133
|
+
return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
|
3070
3134
|
}
|
3071
3135
|
|
3072
|
-
template <int qk, int qr, int qi, typename block_q_t,
|
3136
|
+
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
|
3073
3137
|
allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
|
3074
3138
|
static __global__ void mul_mat_q(
|
3075
3139
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
@@ -3084,14 +3148,11 @@ static __global__ void mul_mat_q(
 
     const int & ncols_dst = ncols_y;
 
-    const int
-    const int tid_y = threadIdx.y;
-
-    const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y;
+    const int row_dst_0 = blockIdx.x*mmq_y;
     const int & row_x_0 = row_dst_0;
-    const int row_dst = row_dst_0 +
+    const int row_dst = row_dst_0 + threadIdx.x;
 
-    const int col_dst_0 = blockIdx.y*
+    const int col_dst_0 = blockIdx.y*mmq_x;
     const int & col_y_0 = col_dst_0;
 
     int * tile_x_ql = nullptr;
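With mmq_x, mmq_y and nwarps now template parameters, the hunk above maps each thread block to an mmq_y x mmq_x tile of dst through `row_dst_0 = blockIdx.x*mmq_y` and `col_dst_0 = blockIdx.y*mmq_x`. A host-side sketch of that mapping with illustrative sizes:

```cpp
#include <cstdio>

int main() {
    const int mmq_x = 64, mmq_y = 128;           // tile shape chosen by the launcher
    const int nrows_x = 4096, ncols_y = 512;     // made-up problem size

    const int grid_x = (nrows_x + mmq_y - 1) / mmq_y;   // block_num_x in the launchers further below
    const int grid_y = (ncols_y + mmq_x - 1) / mmq_x;   // block_num_y

    for (int bx = 0; bx < grid_x && bx < 2; ++bx) {      // print just a corner of the grid
        for (int by = 0; by < grid_y && by < 2; ++by) {
            const int row_dst_0 = bx * mmq_y;            // first dst row handled by this block
            const int col_dst_0 = by * mmq_x;            // first dst column handled by this block
            printf("block (%d,%d) -> rows [%d,%d) cols [%d,%d)\n",
                   bx, by, row_dst_0, row_dst_0 + mmq_y, col_dst_0, col_dst_0 + mmq_x);
        }
    }
    return 0;
}
```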
@@ -3101,55 +3162,65 @@ static __global__ void mul_mat_q(
|
|
3101
3162
|
|
3102
3163
|
allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
|
3103
3164
|
|
3104
|
-
|
3105
|
-
|
3106
|
-
__shared__ int tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)];
|
3107
|
-
__shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col];
|
3165
|
+
__shared__ int tile_y_qs[mmq_x * WARP_SIZE];
|
3166
|
+
__shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
|
3108
3167
|
|
3109
|
-
float sum[
|
3168
|
+
float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
|
3110
3169
|
|
3111
3170
|
for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
|
3112
3171
|
|
3113
3172
|
load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
|
3114
|
-
|
3173
|
+
threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
|
3115
3174
|
|
3175
|
+
#pragma unroll
|
3116
3176
|
for (int ir = 0; ir < qr; ++ir) {
|
3117
|
-
const int kqs = ir*WARP_SIZE +
|
3177
|
+
const int kqs = ir*WARP_SIZE + threadIdx.x;
|
3118
3178
|
const int kbxd = kqs / QI8_1;
|
3119
3179
|
|
3120
|
-
|
3121
|
-
|
3180
|
+
#pragma unroll
|
3181
|
+
for (int i = 0; i < mmq_x; i += nwarps) {
|
3182
|
+
const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
|
3122
3183
|
|
3123
3184
|
const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
|
3124
3185
|
|
3125
|
-
|
3186
|
+
const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
|
3187
|
+
tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
|
3126
3188
|
}
|
3127
|
-
}
|
3128
3189
|
|
3129
|
-
|
3130
|
-
|
3131
|
-
|
3132
|
-
|
3133
|
-
|
3134
|
-
|
3190
|
+
#pragma unroll
|
3191
|
+
for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
|
3192
|
+
const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
|
3193
|
+
const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
|
3194
|
+
const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
|
3195
|
+
|
3196
|
+
// if the sum is not needed it's faster to transform the scale to f32 ahead of time
|
3197
|
+
const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
|
3198
|
+
half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
|
3199
|
+
if (need_sum) {
|
3200
|
+
*dsi_dst = *dsi_src;
|
3201
|
+
} else {
|
3202
|
+
float * dfi_dst = (float *) dsi_dst;
|
3203
|
+
*dfi_dst = (*dsi_src).x;
|
3204
|
+
}
|
3205
|
+
}
|
3135
3206
|
|
3136
|
-
|
3207
|
+
__syncthreads();
|
3137
3208
|
|
3138
|
-
#
|
3139
|
-
|
3140
|
-
#endif // __CUDA_ARCH__ >= 700
|
3141
|
-
for (int k = 0; k < WARP_SIZE; k += vdr) {
|
3209
|
+
// #pragma unroll // unrolling this loop causes too much register pressure
|
3210
|
+
for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
|
3142
3211
|
#pragma unroll
|
3143
|
-
|
3212
|
+
for (int j = 0; j < mmq_x; j += nwarps) {
|
3144
3213
|
#pragma unroll
|
3145
|
-
|
3146
|
-
|
3147
|
-
|
3214
|
+
for (int i = 0; i < mmq_y; i += WARP_SIZE) {
|
3215
|
+
sum[i/WARP_SIZE][j/nwarps] += vec_dot(
|
3216
|
+
tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
|
3217
|
+
threadIdx.x + i, threadIdx.y + j, k);
|
3218
|
+
}
|
3148
3219
|
}
|
3149
3220
|
}
|
3150
|
-
}
|
3151
3221
|
|
3152
|
-
|
3222
|
+
__syncthreads();
|
3223
|
+
}
|
3153
3224
|
}
|
3154
3225
|
|
3155
3226
|
|
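The body of mul_mat_q (previous hunk) follows the classic shared-memory tiling pattern: stage a slice of x and of y, synchronize, accumulate partial dot products into per-thread register sums, synchronize again, then move to the next slice. The following plain-C++ sketch mirrors only that control structure on float data; the sizes and names are invented, and the real kernel operates on quantized blocks, keeps `sum[mmq_y/WARP_SIZE][mmq_x/nwarps]` per thread, and calls vec_dot for the inner product:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int M = 8, N = 6, K = 12;   // dst is M x N, shared dimension K (made-up sizes)
    const int TK = 4;                 // k-slice width staged per iteration (plays the role of a tile)

    std::vector<float> x(M*K), y(K*N), dst(M*N, 0.0f), ref(M*N, 0.0f);
    for (int i = 0; i < M*K; ++i) x[i] = 0.01f*i;
    for (int i = 0; i < K*N; ++i) y[i] = 0.02f*i;

    std::vector<float> tile_x(M*TK), tile_y(TK*N);       // stand-ins for the __shared__ tiles
    for (int k0 = 0; k0 < K; k0 += TK) {                 // outer loop over k-slices (the ib0 loop)
        for (int i = 0; i < M; ++i)                      // "load_tiles": copy the x slice
            for (int kk = 0; kk < TK; ++kk)
                tile_x[i*TK + kk] = x[i*K + k0 + kk];
        for (int kk = 0; kk < TK; ++kk)                  // copy the y slice (tile_y_qs / tile_y_ds)
            for (int j = 0; j < N; ++j)
                tile_y[kk*N + j] = y[(k0 + kk)*N + j];
        // ... __syncthreads() would go here in the kernel ...
        for (int i = 0; i < M; ++i)                      // accumulate this slice into the sums
            for (int j = 0; j < N; ++j)
                for (int kk = 0; kk < TK; ++kk)
                    dst[j*M + i] += tile_x[i*TK + kk] * tile_y[kk*N + j];  // dst indexed col*nrows + row, like the kernel
        // ... second __syncthreads() before the next slice overwrites the tiles ...
    }

    for (int i = 0; i < M; ++i)                          // naive reference for comparison
        for (int j = 0; j < N; ++j)
            for (int k = 0; k < K; ++k)
                ref[j*M + i] += x[i*K + k] * y[k*N + j];

    float max_err = 0.0f;
    for (int i = 0; i < M*N; ++i) max_err = std::max(max_err, std::abs(dst[i] - ref[i]));
    printf("max abs difference vs. naive product: %g\n", max_err);
    return 0;
}
```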
@@ -3157,15 +3228,15 @@ static __global__ void mul_mat_q(
         return;
     }
 
-    for (int j = 0; j <
-        const int col_dst = col_dst_0 + j +
+    for (int j = 0; j < mmq_x; j += nwarps) {
+        const int col_dst = col_dst_0 + j + threadIdx.y;
 
         if (col_dst >= ncols_dst) {
             return;
         }
 
-        for (int i = 0; i <
-            dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/
+        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+            dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/nwarps];
         }
     }
 }
@@ -3780,7 +3851,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
+    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -3789,7 +3860,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
+    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -3798,7 +3869,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
+    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -3807,7 +3878,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
+    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -3816,7 +3887,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
+    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
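The mul_mat_vec_q*_K wrappers above only changed their template arguments (the new VDR_*_MMVQ constants and the per-type vec_dot functions); the launch geometry is untouched. A host-side sketch of how that geometry is computed (GGML_CUDA_MMV_Y defaults to 1 in this file unless overridden at build time; the row count is made up):

```cpp
#include <cstdio>

int main() {
    const int WARP_SIZE = 32, GGML_CUDA_MMV_Y = 1;   // defaults used by ggml-cuda.cu
    const int nrows = 4096;                          // illustrative row count

    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;  // ceil division
    printf("grid  = (1, %d, 1)\n", block_num_y);                  // block_nums
    printf("block = (%d, %d, 1)\n", WARP_SIZE, GGML_CUDA_MMV_Y);  // block_dims
    return 0;
}
```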
@@ -3867,17 +3938,52 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
|
|
3867
3938
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
3868
3939
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
3869
3940
|
|
3870
|
-
|
3871
|
-
|
3872
|
-
const
|
3873
|
-
|
3874
|
-
|
3875
|
-
|
3876
|
-
|
3877
|
-
|
3941
|
+
int id;
|
3942
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
3943
|
+
const int compute_capability = g_compute_capabilities[id];
|
3944
|
+
|
3945
|
+
if (compute_capability >= CC_TURING) {
|
3946
|
+
const int mmq_x = 64;
|
3947
|
+
const int mmq_y = 128;
|
3948
|
+
const int nwarps = 4;
|
3949
|
+
|
3950
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
3951
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
3952
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
3953
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
3954
|
+
|
3955
|
+
if (nrows_x % mmq_y == 0) {
|
3956
|
+
const bool need_check = false;
|
3957
|
+
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3958
|
+
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3959
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3960
|
+
} else {
|
3961
|
+
const bool need_check = true;
|
3962
|
+
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3963
|
+
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3964
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3965
|
+
}
|
3878
3966
|
} else {
|
3879
|
-
|
3880
|
-
|
3967
|
+
const int mmq_x = 64;
|
3968
|
+
const int mmq_y = 64;
|
3969
|
+
const int nwarps = 4;
|
3970
|
+
|
3971
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
3972
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
3973
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
3974
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
3975
|
+
|
3976
|
+
if (nrows_x % mmq_y == 0) {
|
3977
|
+
const bool need_check = false;
|
3978
|
+
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3979
|
+
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3980
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3981
|
+
} else {
|
3982
|
+
const bool need_check = true;
|
3983
|
+
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3984
|
+
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3985
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3986
|
+
}
|
3881
3987
|
}
|
3882
3988
|
}
|
3883
3989
|
|
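Every ggml_mul_mat_q*_q8_1_cuda wrapper now queries the device's compute capability and picks mmq_x/mmq_y/nwarps per architecture before launching mul_mat_q. A hedged host-side sketch of that dispatch shape: the Turing-branch constants mirror the q4_0 wrapper above, while the fallback constants and the problem size here are purely illustrative.

```cpp
#include <cstdio>
#include <initializer_list>

struct mmq_config { int mmq_x, mmq_y, nwarps; };

static mmq_config choose_mmq_config(int compute_capability) {
    const int CC_TURING = 700;            // same threshold the file defines
    if (compute_capability >= CC_TURING) {
        return {64, 128, 4};              // wider tile on Turing and newer (q4_0 values above)
    }
    return {64, 64, 4};                   // smaller fallback tile for older architectures (illustrative)
}

int main() {
    const int nrows_x = 4096, ncols_y = 512;         // made-up problem size
    for (int cc : {610, 700, 860}) {
        const mmq_config c = choose_mmq_config(cc);
        const int block_num_x = (nrows_x + c.mmq_y - 1) / c.mmq_y;   // grid over rows of x
        const int block_num_y = (ncols_y + c.mmq_x - 1) / c.mmq_x;   // grid over columns of y
        printf("cc %d: mmq_x=%d mmq_y=%d nwarps=%d grid=(%d,%d)\n",
               cc, c.mmq_x, c.mmq_y, c.nwarps, block_num_x, block_num_y);
    }
    return 0;
}
```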
@@ -3885,17 +3991,53 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
|
|
3885
3991
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
3886
3992
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
3887
3993
|
|
3888
|
-
|
3889
|
-
|
3890
|
-
const
|
3891
|
-
|
3892
|
-
|
3893
|
-
|
3894
|
-
|
3895
|
-
|
3994
|
+
int id;
|
3995
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
3996
|
+
const int compute_capability = g_compute_capabilities[id];
|
3997
|
+
|
3998
|
+
if (compute_capability >= CC_TURING) {
|
3999
|
+
const int mmq_x = 64;
|
4000
|
+
const int mmq_y = 128;
|
4001
|
+
const int nwarps = 4;
|
4002
|
+
|
4003
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4004
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4005
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4006
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4007
|
+
|
4008
|
+
if (nrows_x % mmq_y == 0) {
|
4009
|
+
const bool need_check = false;
|
4010
|
+
mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
|
4011
|
+
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
|
4012
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4013
|
+
} else {
|
4014
|
+
const bool need_check = true;
|
4015
|
+
mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
|
4016
|
+
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
|
4017
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4018
|
+
}
|
3896
4019
|
} else {
|
3897
|
-
|
3898
|
-
|
4020
|
+
const int mmq_x = 64;
|
4021
|
+
const int mmq_y = 64;
|
4022
|
+
const int nwarps = 8;
|
4023
|
+
|
4024
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4025
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4026
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4027
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4028
|
+
|
4029
|
+
if (nrows_x % mmq_y == 0) {
|
4030
|
+
const bool need_check = false;
|
4031
|
+
mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
|
4032
|
+
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
|
4033
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4034
|
+
} else {
|
4035
|
+
const bool need_check = true;
|
4036
|
+
mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
|
4037
|
+
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
|
4038
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4039
|
+
}
|
4040
|
+
|
3899
4041
|
}
|
3900
4042
|
}
|
3901
4043
|
|
@@ -3903,17 +4045,52 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
|
|
3903
4045
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
3904
4046
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
3905
4047
|
|
3906
|
-
|
3907
|
-
|
3908
|
-
const
|
3909
|
-
|
3910
|
-
|
3911
|
-
|
3912
|
-
|
3913
|
-
|
4048
|
+
int id;
|
4049
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
4050
|
+
const int compute_capability = g_compute_capabilities[id];
|
4051
|
+
|
4052
|
+
if (compute_capability >= CC_TURING) {
|
4053
|
+
const int mmq_x = 128;
|
4054
|
+
const int mmq_y = 64;
|
4055
|
+
const int nwarps = 4;
|
4056
|
+
|
4057
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4058
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4059
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4060
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4061
|
+
|
4062
|
+
if (nrows_x % mmq_y == 0) {
|
4063
|
+
const bool need_check = false;
|
4064
|
+
mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
|
4065
|
+
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
4066
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4067
|
+
} else {
|
4068
|
+
const bool need_check = true;
|
4069
|
+
mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
|
4070
|
+
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
4071
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4072
|
+
}
|
3914
4073
|
} else {
|
3915
|
-
|
3916
|
-
|
4074
|
+
const int mmq_x = 64;
|
4075
|
+
const int mmq_y = 64;
|
4076
|
+
const int nwarps = 8;
|
4077
|
+
|
4078
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4079
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4080
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4081
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4082
|
+
|
4083
|
+
if (nrows_x % mmq_y == 0) {
|
4084
|
+
const bool need_check = false;
|
4085
|
+
mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
|
4086
|
+
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
4087
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4088
|
+
} else {
|
4089
|
+
const bool need_check = true;
|
4090
|
+
mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
|
4091
|
+
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
4092
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4093
|
+
}
|
3917
4094
|
}
|
3918
4095
|
}
|
3919
4096
|
|
@@ -3921,17 +4098,52 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
|
|
3921
4098
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
3922
4099
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
3923
4100
|
|
3924
|
-
|
3925
|
-
|
3926
|
-
const
|
3927
|
-
|
3928
|
-
|
3929
|
-
|
3930
|
-
|
3931
|
-
|
4101
|
+
int id;
|
4102
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
4103
|
+
const int compute_capability = g_compute_capabilities[id];
|
4104
|
+
|
4105
|
+
if (compute_capability >= CC_TURING) {
|
4106
|
+
const int mmq_x = 128;
|
4107
|
+
const int mmq_y = 64;
|
4108
|
+
const int nwarps = 8;
|
4109
|
+
|
4110
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4111
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4112
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4113
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4114
|
+
|
4115
|
+
if (nrows_x % mmq_y == 0) {
|
4116
|
+
const bool need_check = false;
|
4117
|
+
mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
|
4118
|
+
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
4119
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4120
|
+
} else {
|
4121
|
+
const bool need_check = true;
|
4122
|
+
mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
|
4123
|
+
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
4124
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4125
|
+
}
|
3932
4126
|
} else {
|
3933
|
-
|
3934
|
-
|
4127
|
+
const int mmq_x = 64;
|
4128
|
+
const int mmq_y = 64;
|
4129
|
+
const int nwarps = 8;
|
4130
|
+
|
4131
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4132
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4133
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4134
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4135
|
+
|
4136
|
+
if (nrows_x % mmq_y == 0) {
|
4137
|
+
const bool need_check = false;
|
4138
|
+
mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
|
4139
|
+
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
4140
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4141
|
+
} else {
|
4142
|
+
const bool need_check = true;
|
4143
|
+
mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
|
4144
|
+
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
4145
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4146
|
+
}
|
3935
4147
|
}
|
3936
4148
|
}
|
3937
4149
|
|
@@ -3939,17 +4151,52 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
|
|
3939
4151
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
3940
4152
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
3941
4153
|
|
3942
|
-
|
3943
|
-
|
3944
|
-
const
|
3945
|
-
|
3946
|
-
|
3947
|
-
|
3948
|
-
|
3949
|
-
|
4154
|
+
int id;
|
4155
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
4156
|
+
const int compute_capability = g_compute_capabilities[id];
|
4157
|
+
|
4158
|
+
if (compute_capability >= CC_TURING) {
|
4159
|
+
const int mmq_x = 128;
|
4160
|
+
const int mmq_y = 64;
|
4161
|
+
const int nwarps = 4;
|
4162
|
+
|
4163
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4164
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4165
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4166
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4167
|
+
|
4168
|
+
if (nrows_x % mmq_y == 0) {
|
4169
|
+
const bool need_check = false;
|
4170
|
+
mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
|
4171
|
+
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
4172
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4173
|
+
} else {
|
4174
|
+
const bool need_check = true;
|
4175
|
+
mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
|
4176
|
+
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
4177
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4178
|
+
}
|
3950
4179
|
} else {
|
3951
|
-
|
3952
|
-
|
4180
|
+
const int mmq_x = 64;
|
4181
|
+
const int mmq_y = 64;
|
4182
|
+
const int nwarps = 8;
|
4183
|
+
|
4184
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4185
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4186
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4187
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4188
|
+
|
4189
|
+
if (nrows_x % mmq_y == 0) {
|
4190
|
+
const bool need_check = false;
|
4191
|
+
mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
|
4192
|
+
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
4193
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4194
|
+
} else {
|
4195
|
+
const bool need_check = true;
|
4196
|
+
mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
|
4197
|
+
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
4198
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4199
|
+
}
|
3953
4200
|
}
|
3954
4201
|
}
|
3955
4202
|
|
@@ -3957,17 +4204,52 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
|
|
3957
4204
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
3958
4205
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
3959
4206
|
|
3960
|
-
|
3961
|
-
|
3962
|
-
const
|
3963
|
-
|
3964
|
-
|
3965
|
-
|
3966
|
-
|
3967
|
-
|
4207
|
+
int id;
|
4208
|
+
CUDA_CHECK(cudaGetDevice(&id));
|
4209
|
+
const int compute_capability = g_compute_capabilities[id];
|
4210
|
+
|
4211
|
+
if (compute_capability >= CC_TURING) {
|
4212
|
+
const int mmq_x = 64;
|
4213
|
+
const int mmq_y = 128;
|
4214
|
+
const int nwarps = 4;
|
4215
|
+
|
4216
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4217
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4218
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4219
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4220
|
+
|
4221
|
+
if (nrows_x % mmq_y == 0) {
|
4222
|
+
const bool need_check = false;
|
4223
|
+
mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
|
4224
|
+
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
|
4225
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4226
|
+
} else {
|
4227
|
+
const bool need_check = true;
|
4228
|
+
mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
|
4229
|
+
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
|
4230
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4231
|
+
}
|
3968
4232
|
} else {
|
3969
|
-
|
3970
|
-
|
4233
|
+
const int mmq_x = 64;
|
4234
|
+
const int mmq_y = 64;
|
4235
|
+
const int nwarps = 8;
|
4236
|
+
|
4237
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4238
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4239
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4240
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4241
|
+
|
4242
|
+
if (nrows_x % mmq_y == 0) {
|
4243
|
+
const bool need_check = false;
|
4244
|
+
mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
|
4245
|
+
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
|
4246
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4247
|
+
} else {
|
4248
|
+
const bool need_check = true;
|
4249
|
+
mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
|
4250
|
+
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
|
4251
|
+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4252
|
+
}
|
3971
4253
|
}
|
3972
4254
|
}
|
3973
4255
|
|
@@ -3975,17 +4257,52 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
3975 4257 |       const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3976 4258 |       const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3977 4259 |
3978      | -
3979      | -
3980      | -     const
3981      | -
3982      | -
3983      | -
3984      | -
3985      | -
     4260 | +     int id;
     4261 | +     CUDA_CHECK(cudaGetDevice(&id));
     4262 | +     const int compute_capability = g_compute_capabilities[id];
     4263 | +
     4264 | +     if (compute_capability >= CC_TURING) {
     4265 | +         const int mmq_x = 128;
     4266 | +         const int mmq_y = 128;
     4267 | +         const int nwarps = 4;
     4268 | +
     4269 | +         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
     4270 | +         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     4271 | +         const dim3 block_nums(block_num_x, block_num_y, 1);
     4272 | +         const dim3 block_dims(WARP_SIZE, nwarps, 1);
     4273 | +
     4274 | +         if (nrows_x % mmq_y == 0) {
     4275 | +             const bool need_check = false;
     4276 | +             mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
     4277 | +                 load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
     4278 | +                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     4279 | +         } else {
     4280 | +             const bool need_check = true;
     4281 | +             mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
     4282 | +                 load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
     4283 | +                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     4284 | +         }
3986 4285 |       } else {
3987      | -
3988      | -
     4286 | +         const int mmq_x = 64;
     4287 | +         const int mmq_y = 64;
     4288 | +         const int nwarps = 8;
     4289 | +
     4290 | +         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
     4291 | +         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     4292 | +         const dim3 block_nums(block_num_x, block_num_y, 1);
     4293 | +         const dim3 block_dims(WARP_SIZE, nwarps, 1);
     4294 | +
     4295 | +         if (nrows_x % mmq_y == 0) {
     4296 | +             const bool need_check = false;
     4297 | +             mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
     4298 | +                 load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
     4299 | +                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     4300 | +         } else {
     4301 | +             const bool need_check = true;
     4302 | +             mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
     4303 | +                 load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
     4304 | +                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     4305 | +         }
3989 4306 |       }
3990 4307 |   }
3991 4308 |
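The launcher above (like its q2_K/q4_K/q5_K/q6_K siblings) picks the tile shape mmq_x × mmq_y and the warp count from the current device's compute capability, then sizes the grid with ceiling division so partial tiles at the matrix edge are still covered; need_check is only true when the row count is not a multiple of the tile height. A small host-side sketch of that arithmetic, with made-up matrix sizes; `TileConfig` and `pick_tiles` are illustrative names for this sketch, not part of ggml.

```cpp
#include <cstdio>

// Illustrative only: mirrors the ceil-division grid sizing used by the
// ggml_mul_mat_q*_q8_1_cuda launchers in this diff. Tile values follow the
// q3_K launcher above; the helper names are made up for this sketch.
struct TileConfig { int mmq_x, mmq_y, nwarps; };

static TileConfig pick_tiles(int compute_capability) {
    // >= 700 (the CC_TURING constant in this diff) gets larger tiles, fewer warps.
    return compute_capability >= 700 ? TileConfig{128, 128, 4}
                                     : TileConfig{ 64,  64, 8};
}

int main() {
    const int nrows_x = 4096;   // rows of the quantized matrix (example value)
    const int ncols_y = 512;    // columns of the activation matrix (example value)

    const TileConfig t = pick_tiles(750);
    // Ceiling division: enough blocks to cover a partial tile at each edge.
    const int block_num_x = (nrows_x + t.mmq_y - 1) / t.mmq_y;
    const int block_num_y = (ncols_y + t.mmq_x - 1) / t.mmq_x;
    // need_check is true only when the last row tile is partial.
    const bool need_check = (nrows_x % t.mmq_y) != 0;

    printf("grid = %d x %d blocks, %d warps/block, need_check = %d\n",
           block_num_x, block_num_y, t.nwarps, need_check);
    return 0;
}
```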
@@ -3993,17 +4310,52 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
3993 4310 |       const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
3994 4311 |       const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
3995 4312 |
3996      | -
3997      | -
3998      | -     const
3999      | -
4000      | -
4001      | -
4002      | -
4003      | -
     4313 | +     int id;
     4314 | +     CUDA_CHECK(cudaGetDevice(&id));
     4315 | +     const int compute_capability = g_compute_capabilities[id];
     4316 | +
     4317 | +     if (compute_capability >= CC_TURING) {
     4318 | +         const int mmq_x = 64;
     4319 | +         const int mmq_y = 128;
     4320 | +         const int nwarps = 4;
     4321 | +
     4322 | +         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
     4323 | +         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     4324 | +         const dim3 block_nums(block_num_x, block_num_y, 1);
     4325 | +         const dim3 block_dims(WARP_SIZE, nwarps, 1);
     4326 | +
     4327 | +         if (nrows_x % mmq_y == 0) {
     4328 | +             const bool need_check = false;
     4329 | +             mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
     4330 | +                 load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
     4331 | +                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     4332 | +         } else {
     4333 | +             const bool need_check = true;
     4334 | +             mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
     4335 | +                 load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
     4336 | +                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     4337 | +         }
4004 4338 |       } else {
4005      | -
4006      | -
     4339 | +         const int mmq_x = 32;
     4340 | +         const int mmq_y = 64;
     4341 | +         const int nwarps = 8;
     4342 | +
     4343 | +         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
     4344 | +         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     4345 | +         const dim3 block_nums(block_num_x, block_num_y, 1);
     4346 | +         const dim3 block_dims(WARP_SIZE, nwarps, 1);
     4347 | +
     4348 | +         if (nrows_x % mmq_y == 0) {
     4349 | +             const bool need_check = false;
     4350 | +             mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
     4351 | +                 load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
     4352 | +                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     4353 | +         } else {
     4354 | +             const bool need_check = true;
     4355 | +             mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
     4356 | +                 load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
     4357 | +                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     4358 | +         }
4007 4359 |       }
4008 4360 |   }
4009 4361 |
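need_check is passed as a template parameter rather than a runtime flag, so the bounds handling can be compiled out entirely on the fast path where the row count divides evenly by mmq_y; only the ragged last tile pays for the guard. A rough CPU-side sketch of the idea, assuming a clamp-style guard similar in spirit to the load_tiles_* helpers; `load_tile` here is a made-up function, not ggml's.

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Illustrative only: a bounds guard controlled by a compile-time flag.
// When need_check == false the branch condition is a constant and the
// clamp can be removed by the compiler, as in the aligned launch path.
template <int tile_rows, bool need_check>
void load_tile(const std::vector<float> & src, int nrows, int row0, float * dst) {
    for (int i = 0; i < tile_rows; ++i) {
        int row = row0 + i;
        if (need_check) {                    // compile-time constant condition
            row = std::min(row, nrows - 1);  // clamp instead of reading past the end
        }
        dst[i] = src[row];
    }
}

int main() {
    std::vector<float> src(100, 1.0f);
    float tile[64];
    load_tile<64, true>(src, 100, 96, tile);   // ragged last tile: clamp needed
    load_tile<64, false>(src, 100, 0, tile);   // aligned tile: no clamp emitted
    printf("loaded two tiles\n");
    return 0;
}
```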
@@ -4011,17 +4363,52 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
4011 4363 |       const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4012 4364 |       const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4013 4365 |
4014      | -
4015      | -
4016      | -     const
4017      | -
4018      | -
4019      | -
4020      | -
4021      | -
     4366 | +     int id;
     4367 | +     CUDA_CHECK(cudaGetDevice(&id));
     4368 | +     const int compute_capability = g_compute_capabilities[id];
     4369 | +
     4370 | +     if (compute_capability >= CC_TURING) {
     4371 | +         const int mmq_x = 64;
     4372 | +         const int mmq_y = 128;
     4373 | +         const int nwarps = 4;
     4374 | +
     4375 | +         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
     4376 | +         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     4377 | +         const dim3 block_nums(block_num_x, block_num_y, 1);
     4378 | +         const dim3 block_dims(WARP_SIZE, nwarps, 1);
     4379 | +
     4380 | +         if (nrows_x % mmq_y == 0) {
     4381 | +             const bool need_check = false;
     4382 | +             mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
     4383 | +                 load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
     4384 | +                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     4385 | +         } else {
     4386 | +             const bool need_check = true;
     4387 | +             mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
     4388 | +                 load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
     4389 | +                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     4390 | +         }
4022 4391 |       } else {
4023      | -
4024      | -
     4392 | +         const int mmq_x = 64;
     4393 | +         const int mmq_y = 64;
     4394 | +         const int nwarps = 8;
     4395 | +
     4396 | +         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
     4397 | +         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     4398 | +         const dim3 block_nums(block_num_x, block_num_y, 1);
     4399 | +         const dim3 block_dims(WARP_SIZE, nwarps, 1);
     4400 | +
     4401 | +         if (nrows_x % mmq_y == 0) {
     4402 | +             const bool need_check = false;
     4403 | +             mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
     4404 | +                 load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
     4405 | +                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     4406 | +         } else {
     4407 | +             const bool need_check = true;
     4408 | +             mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
     4409 | +                 load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
     4410 | +                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     4411 | +         }
4025 4412 |       }
4026 4413 |   }
4027 4414 |
@@ -4029,17 +4416,52 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
4029 4416 |       const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
4030 4417 |       const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
4031 4418 |
4032      | -
4033      | -
4034      | -     const
4035      | -
4036      | -
4037      | -
4038      | -
4039      | -
     4419 | +     int id;
     4420 | +     CUDA_CHECK(cudaGetDevice(&id));
     4421 | +     const int compute_capability = g_compute_capabilities[id];
     4422 | +
     4423 | +     if (compute_capability >= CC_TURING) {
     4424 | +         const int mmq_x = 64;
     4425 | +         const int mmq_y = 64;
     4426 | +         const int nwarps = 4;
     4427 | +
     4428 | +         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
     4429 | +         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     4430 | +         const dim3 block_nums(block_num_x, block_num_y, 1);
     4431 | +         const dim3 block_dims(WARP_SIZE, nwarps, 1);
     4432 | +
     4433 | +         if (nrows_x % mmq_y == 0) {
     4434 | +             const bool need_check = false;
     4435 | +             mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
     4436 | +                 load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
     4437 | +                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     4438 | +         } else {
     4439 | +             const bool need_check = true;
     4440 | +             mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
     4441 | +                 load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
     4442 | +                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     4443 | +         }
4040 4444 |       } else {
4041      | -
4042      | -
     4445 | +         const int mmq_x = 32;
     4446 | +         const int mmq_y = 64;
     4447 | +         const int nwarps = 8;
     4448 | +
     4449 | +         const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
     4450 | +         const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
     4451 | +         const dim3 block_nums(block_num_x, block_num_y, 1);
     4452 | +         const dim3 block_dims(WARP_SIZE, nwarps, 1);
     4453 | +
     4454 | +         if (nrows_x % mmq_y == 0) {
     4455 | +             const bool need_check = false;
     4456 | +             mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
     4457 | +                 load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
     4458 | +                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     4459 | +         } else {
     4460 | +             const bool need_check = true;
     4461 | +             mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
     4462 | +                 load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
     4463 | +                 <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     4464 | +         }
4043 4465 |       }
4044 4466 |   }
4045 4467 |
@@ -4214,20 +4636,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
4214 4636 |   }
4215 4637 |
4216 4638 |
4217      | - static void * g_scratch_buffer = nullptr;
4218      | - static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
4219      | - static size_t g_scratch_offset = 0;
4220      | -
4221      | - static int g_device_count = -1;
4222      | - static int g_main_device = 0;
4223      | - static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
4224      | - static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
4225      | - static bool g_mul_mat_q = false;
4226      | -
4227      | - static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
4228      | -
4229      | - static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
4230      | -
4231 4639 |   void ggml_init_cublas() {
4232 4640 |       static bool initialized = false;
4233 4641 |
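These globals are only removed here because they moved above the kernel launchers (the matching additions appear near the top of the file earlier in this diff): the ggml_mul_mat_q*_q8_1_cuda functions now read g_compute_capabilities at dispatch time, so the table has to be declared before them. A minimal sketch of how such a per-device capability table can be filled with the CUDA runtime API; in the real file this presumably happens during initialization (ggml_init_cublas appears as context just above), and the table name and size macro below are local to this sketch.

```cpp
#include <cstdio>
#include <cuda_runtime.h>

#define SKETCH_MAX_DEVICES 16   // local to this sketch, not ggml's macro

// Illustrative only: fill a compute-capability table (e.g. 7.5 -> 750)
// so later launch-configuration code can branch on it per device.
static int cc_table[SKETCH_MAX_DEVICES];

int main() {
    int device_count = 0;
    if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
        return 1;
    }
    for (int id = 0; id < device_count && id < SKETCH_MAX_DEVICES; ++id) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, id) != cudaSuccess) {
            return 1;
        }
        cc_table[id] = 100 * prop.major + 10 * prop.minor;
        printf("device %d: compute capability %d\n", id, cc_table[id]);
    }
    return 0;
}
```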
@@ -4583,6 +4991,37 @@ inline void ggml_cuda_op_mul_mat_q(
4583 4991 |       (void) i1;
4584 4992 |   }
4585 4993 |
     4994 | + static int64_t get_row_rounding(ggml_type type) {
     4995 | +     int max_compute_capability = INT_MIN;
     4996 | +     for (int id = 0; id < g_device_count; ++id) {
     4997 | +         if (max_compute_capability < g_compute_capabilities[id]
     4998 | +             && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
     4999 | +             max_compute_capability = g_compute_capabilities[id];
     5000 | +         }
     5001 | +     }
     5002 | +
     5003 | +     switch(type) {
     5004 | +         case GGML_TYPE_Q4_0:
     5005 | +         case GGML_TYPE_Q4_1:
     5006 | +             return max_compute_capability >= CC_TURING ? 128 : 64;
     5007 | +         case GGML_TYPE_Q5_0:
     5008 | +         case GGML_TYPE_Q5_1:
     5009 | +         case GGML_TYPE_Q8_0:
     5010 | +             return 64;
     5011 | +         case GGML_TYPE_F16:
     5012 | +             return 1;
     5013 | +         case GGML_TYPE_Q2_K:
     5014 | +         case GGML_TYPE_Q3_K:
     5015 | +         case GGML_TYPE_Q4_K:
     5016 | +         case GGML_TYPE_Q5_K:
     5017 | +             return max_compute_capability >= CC_TURING ? 128 : 64;
     5018 | +         case GGML_TYPE_Q6_K:
     5019 | +             return 64;
     5020 | +         default:
     5021 | +             GGML_ASSERT(false);
     5022 | +     }
     5023 | + }
     5024 | +
4586 5025 |   inline void ggml_cuda_op_mul_mat_vec(
4587 5026 |       const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
4588 5027 |       float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
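The extra `g_tensor_split[id] < ...` condition in get_row_rounding (and in ggml_cuda_mul_mat further down) skips devices whose slice of the tensor split is empty, so an idle GPU's compute capability cannot influence the rounding or the kernel selection. A standalone sketch of just that condition; the split and capability values below are made up for the example.

```cpp
#include <cstdio>

// Illustrative only: a device "owns" rows exactly when its split fraction
// is strictly below the next device's fraction (or below 1.0 for the last
// device). Devices with an empty slice are ignored when taking the max.
int main() {
    const float split[] = {0.0f, 1.0f, 1.0f};   // device 0 gets everything here
    const int   cc[]    = {610, 750, 860};      // pretend compute capabilities
    const int   ndev    = 3;

    int max_cc = -1;
    for (int id = 0; id < ndev; ++id) {
        const float next     = id + 1 < ndev ? split[id + 1] : 1.0f;
        const bool  has_rows = split[id] < next;   // non-empty row range?
        if (has_rows && cc[id] > max_cc) {
            max_cc = cc[id];
        }
        printf("device %d: has_rows=%d cc=%d\n", id, has_rows, cc[id]);
    }
    printf("max compute capability among active devices: %d\n", max_cc);
    return 0;
}
```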
@@ -4983,14 +5422,16 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
4983 5422 |
4984 5423 |       int64_t row_low, row_high;
4985 5424 |       if (split) {
     5425 | +         const int64_t rounding = get_row_rounding(src0->type);
     5426 | +
4986 5427 |           row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
4987      | -         row_low -= row_low %
     5428 | +         row_low -= row_low % rounding;
4988 5429 |
4989 5430 |           if (id == g_device_count - 1) {
4990 5431 |               row_high = nrows0;
4991 5432 |           } else {
4992 5433 |               row_high = nrows0*g_tensor_split[id + 1];
4993      | -             row_high -= row_high %
     5434 | +             row_high -= row_high % rounding;
4994 5435 |           }
4995 5436 |       } else {
4996 5437 |           row_low = 0;
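With the rounding in place, each device's row range is derived from the g_tensor_split fractions and then snapped down to a tile-aligned boundary, so adjacent devices meet exactly at a multiple of the rounding value. A standalone sketch of that computation with made-up values (1000 rows, a 60/40 split, rounding of 128); the variable names mirror the code above but nothing else is taken from ggml.

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative only: per-device row ranges from split fractions, snapped
// down to the rounding returned by get_row_rounding in the diff above.
int main() {
    const int64_t nrows    = 1000;            // rows of the weight matrix
    const float   split[]  = {0.0f, 0.6f};    // device 0 starts at 0%, device 1 at 60%
    const int     ndev     = 2;
    const int64_t rounding = 128;             // e.g. K-quants on a >= CC_TURING card

    for (int id = 0; id < ndev; ++id) {
        int64_t row_low = id == 0 ? 0 : (int64_t)(nrows * split[id]);
        row_low -= row_low % rounding;        // keep the lower boundary tile-aligned
        int64_t row_high = id == ndev - 1 ? nrows : (int64_t)(nrows * split[id + 1]);
        if (id != ndev - 1) {
            row_high -= row_high % rounding;  // last device keeps the exact end
        }
        printf("device %d: rows [%lld, %lld)\n", id, (long long)row_low, (long long)row_high);
    }
    return 0;
}
```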
@@ -5203,7 +5644,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
5203 5644 |       if (split && g_device_count > 1) {
5204 5645 |           CUDA_CHECK(cudaSetDevice(g_main_device));
5205 5646 |           for (int id = 0; id < g_device_count; ++id) {
5206      | -             if (id != g_main_device) {
     5647 | +             if (id != g_main_device && src0_extra->events[id]) {
5207 5648 |                   CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
5208 5649 |               }
5209 5650 |           }
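The added null check on `src0_extra->events[id]` presumably guards against devices that never recorded an event for this tensor (for example because the rounded split left them with no rows), since waiting on an event that was never created is invalid. A minimal sketch of the record-then-wait pattern with the same kind of guard; the stream and event names are illustrative, and error handling is kept minimal.

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Illustrative only: a worker stream records an event when it actually did
// work; the main stream waits on it only if the event exists, mirroring the
// "&& src0_extra->events[id]" guard in the diff above.
int main() {
    cudaStream_t main_stream = nullptr, worker_stream = nullptr;
    cudaEvent_t  done = nullptr;                    // stays nullptr if the worker did nothing

    if (cudaStreamCreate(&main_stream)   != cudaSuccess) return 1;
    if (cudaStreamCreate(&worker_stream) != cudaSuccess) return 1;

    const bool worker_had_work = true;              // toggle to exercise the skipped path
    if (worker_had_work) {
        cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
        cudaEventRecord(done, worker_stream);       // mark the worker's progress
    }

    if (done) {                                     // skip the wait when no event was recorded
        cudaStreamWaitEvent(main_stream, done, 0);  // main stream waits for the worker
    }

    cudaStreamSynchronize(main_stream);
    printf("done\n");
    return 0;
}
```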
@@ -5347,7 +5788,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
5347 5788 |       } else {
5348 5789 |           int min_compute_capability = INT_MAX;
5349 5790 |           for (int id = 0; id < g_device_count; ++id) {
5350      | -             if (min_compute_capability > g_compute_capabilities[id]
     5791 | +             if (min_compute_capability > g_compute_capabilities[id]
     5792 | +                     && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
5351 5793 |                   min_compute_capability = g_compute_capabilities[id];
5352 5794 |               }
5353 5795 |           }
@@ -5468,14 +5910,16 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
5468 5910 |           row_low = 0;
5469 5911 |           row_high = nrows;
5470 5912 |       } else if (backend == GGML_BACKEND_GPU_SPLIT) {
     5913 | +         const int64_t rounding = get_row_rounding(tensor->type);
     5914 | +
5471 5915 |           row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
5472      | -         row_low -= row_low %
     5916 | +         row_low -= row_low % rounding;
5473 5917 |
5474 5918 |           if (id == g_device_count - 1) {
5475 5919 |               row_high = nrows;
5476 5920 |           } else {
5477 5921 |               row_high = nrows*g_tensor_split[id + 1];
5478      | -             row_high -= row_high %
     5922 | +             row_high -= row_high % rounding;
5479 5923 |           }
5480 5924 |       } else {
5481 5925 |           GGML_ASSERT(false);