llama_cpp 0.3.7 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +118 -117
- data/ext/llama_cpp/src/ggml-alloc.c +97 -53
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1010 -497
- data/ext/llama_cpp/src/ggml-cuda.h +32 -23
- data/ext/llama_cpp/src/ggml-metal.h +9 -3
- data/ext/llama_cpp/src/ggml-metal.m +142 -161
- data/ext/llama_cpp/src/ggml-metal.metal +577 -500
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama-util.h +10 -8
- data/ext/llama_cpp/src/llama.cpp +4544 -2890
- data/ext/llama_cpp/src/llama.h +133 -123
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +8 -8
- metadata +2 -2
@@ -6,15 +6,116 @@
 #include <atomic>
 #include <assert.h>
 
+#if defined(GGML_USE_HIPBLAS)
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F HIPBLAS_R_16F
+#define CUDA_R_32F HIPBLAS_R_32F
+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasCreate hipblasCreate
+#define cublasGemmEx hipblasGemmEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventRecord hipEventRecord
+#define cudaEvent_t hipEvent_t
+#define cudaEventDestroy hipEventDestroy
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaMalloc hipMalloc
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyKind hipMemcpyKind
+#define cudaMemset hipMemset
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+#else
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
+#endif
 
 #include "ggml-cuda.h"
 #include "ggml.h"
 
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#ifndef CC_TURING
 #define CC_TURING 700
+#endif
+
+#if defined(GGML_USE_HIPBLAS)
+#define __CUDA_ARCH__ 1300
+
+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+    return reinterpret_cast<const int&>(c);
+}
+
+static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(__gfx1100__)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+}
+#endif
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -259,6 +360,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
@@ -286,7 +388,7 @@ static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q =
+static bool g_mul_mat_q = true;
 
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
@@ -423,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q4_1 * x = (const block_q4_1 *) vx;
 
-    const dfloat d = x[ib].dm
-    const dfloat m = x[ib].dm
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
 
     const int vui = x[ib].qs[iqs];
 
@@ -466,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q5_1 * x = (const block_q5_1 *) vx;
 
-    const dfloat d = x[ib].dm
-    const dfloat m = x[ib].dm
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
 
     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -519,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const uint8_t q = x[i].qs[32*n + l];
     float * y = yy + i*QK_K + 128*n;
 
-    float dall = x[i].dm
-    float dmin = x[i].dm
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
     y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -530,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const int il = tid%16; // 0...15
     const uint8_t q = x[i].qs[il] >> (2*is);
     float * y = yy + i*QK_K + 16*is + il;
-    float dall = x[i].dm
-    float dmin = x[i].dm
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
     y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
 #endif
@@ -617,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 
     float * y = yy + i*QK_K + 64*il + n*ir;
 
-    const float dall = x[i].dm
-    const float dmin = x[i].dm
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
 
     const uint8_t * q = x[i].qs + 32*il + n*ir;
 
@@ -656,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
 
     float * y = yy + i*QK_K + 64*il + 2*ir;
 
-    const float dall = x[i].dm
-    const float dmin = x[i].dm
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
 
     const uint8_t * ql = x[i].qs + 32*il + 2*ir;
     const uint8_t * qh = x[i].qh + 2*ir;
@@ -769,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
         const float * y = yy + i * QK_K + y_offset;
         const uint8_t * q = x[i].qs + q_offset;
 
-        const float dall = x[i].dm
-        const float dmin = x[i].dm
+        const float dall = __low2half(x[i].dm);
+        const float dmin = __high2half(x[i].dm);
 
         const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
         aux[0] = a[0] & 0x0f0f0f0f;
@@ -990,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
         const float * y1 = yy + i*QK_K + y_offset;
         const float * y2 = y1 + 128;
 
-        const float dall = x[i].dm
-        const float dmin = x[i].dm
+        const float dall = __low2half(x[i].dm);
+        const float dmin = __high2half(x[i].dm);
 
         const uint16_t * a = (const uint16_t *)x[i].scales;
         aux[0] = a[im+0] & kmask1;
@@ -1123,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
         const float * y1 = yy + i*QK_K + y_offset;
         const float * y2 = y1 + 128;
 
-        const float dall = x[i].dm
-        const float dmin = x[i].dm
+        const float dall = __low2half(x[i].dm);
+        const float dmin = __high2half(x[i].dm);
 
         const uint16_t * a = (const uint16_t *)x[i].scales;
         aux[0] = a[im+0] & kmask1;
@@ -1347,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
         return;
     }
 
-    y[ib].ds.x = d;
-    y[ib].ds.y = sum;
+    reinterpret_cast<half&>(y[ib].ds.x) = d;
+    reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -1399,6 +1501,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1436,6 +1539,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
+    assert(false);
    return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1471,6 +1575,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1516,6 +1621,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1537,6 +1643,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
 
     return d8_0*d8_1 * sumi;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1567,6 +1674,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1602,6 +1710,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 
     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1639,6 +1748,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
 
     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1679,6 +1789,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 
     return d3 * sumf;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1704,6 +1815,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
 
     return d3*d8 * sumi;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1737,12 +1849,12 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 // contiguous u/y values
-// also used for q5_K
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
     const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
@@ -1752,19 +1864,18 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     float sumf_m = 0.0f;
 
 #pragma unroll
-    for (int
+    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
         int sumi_d = 0;
 
 #pragma unroll
-        for (int
-            sumi_d = __dp4a(v[
-            sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
         }
 
-        const float2 ds8f = __half22float2(ds8[
+        const float2 ds8f = __half22float2(ds8[i]);
 
-        sumf_d += ds8f.x * (sc[
-        sumf_m += ds8f.y * m[
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
     }
 
     const float2 dm4f = __half22float2(dm4);
@@ -1772,6 +1883,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1780,7 +1892,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
 #define VDR_Q5_K_Q8_1_MMQ 8
 
 // contiguous v/x values
-static __device__ __forceinline__ float
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
     const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
 
@@ -1812,6 +1924,41 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
     return dm5f.x*sumf_d - dm5f.y*sumf_m;
 
 #else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i]);
+
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1842,6 +1989,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 
     return d*sumf;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1873,6 +2021,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;
 
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -2298,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
         u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
     }
 
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2384,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR2_K; ++ i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
     }
 
     return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -2503,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR3_K; ++i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
     }
 
     return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -2672,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 
     for (int i = 0; i < QR4_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds
+        d8[i] = __low2half(bq8i->ds);
 
         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
@@ -2699,8 +2848,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     const float dall = bq4_K->d[0];
     const float dmin = bq4_K->d[1];
 
-    const float d8_1 = bq8_1[0].ds
-    const float d8_2 = bq8_1[1].ds
+    const float d8_1 = __low2float(bq8_1[0].ds);
+    const float d8_2 = __low2float(bq8_1[1].ds);
 
     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2722,6 +2871,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;
 
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
@@ -2808,18 +2958,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
-    int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
-        v[l + 0] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
-        v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
-    }
-
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
 
     const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
-    return vec_dot_q4_K_q8_1_impl_mmq(
+    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
 }
 
 static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
@@ -2859,14 +3002,14 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR5_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds
+        d8[i] = __low2float(bq8i->ds);
 
         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
         u[2*i+1] = q8[4];
     }
 
-    return
+    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
 
 #else
 
@@ -2877,8 +3020,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 
     const float d = bq5_K->d;
 
-    const float d8_1 = bq8_1[0].ds
-    const float d8_2 = bq8_1[1].ds
+    const float d8_1 = __low2half(bq8_1[0].ds);
+    const float d8_2 = __low2half(bq8_1[1].ds);
 
     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2905,6 +3048,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;
 
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
@@ -3008,7 +3152,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
|
|
3008
3152
|
|
3009
3153
|
const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
|
3010
3154
|
const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
|
3011
|
-
return
|
3155
|
+
return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
|
3156
|
+
x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
|
3012
3157
|
}
|
3013
3158
|
|
3014
3159
|
static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
@@ -3031,7 +3176,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
|
3031
3176
|
#pragma unroll
|
3032
3177
|
for (int i = 0; i < QR6_K; ++i) {
|
3033
3178
|
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
|
3034
|
-
d8[i] = bq8_1[bq8_offset + 2*i].ds
|
3179
|
+
d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
|
3035
3180
|
}
|
3036
3181
|
|
3037
3182
|
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
|
@@ -3135,7 +3280,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
|
|
3135
3280
|
|
3136
3281
|
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
|
3137
3282
|
allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
|
3138
|
-
static
|
3283
|
+
static __device__ __forceinline__ void mul_mat_q(
|
3139
3284
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3140
3285
|
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3141
3286
|
|
@@ -3150,7 +3295,6 @@ static __global__ void mul_mat_q(
|
|
3150
3295
|
|
3151
3296
|
const int row_dst_0 = blockIdx.x*mmq_y;
|
3152
3297
|
const int & row_x_0 = row_dst_0;
|
3153
|
-
const int row_dst = row_dst_0 + threadIdx.x;
|
3154
3298
|
|
3155
3299
|
const int col_dst_0 = blockIdx.y*mmq_x;
|
3156
3300
|
const int & col_y_0 = col_dst_0;
|
@@ -3200,7 +3344,7 @@ static __global__ void mul_mat_q(
|
|
3200
3344
|
*dsi_dst = *dsi_src;
|
3201
3345
|
} else {
|
3202
3346
|
float * dfi_dst = (float *) dsi_dst;
|
3203
|
-
*dfi_dst = (*dsi_src)
|
3347
|
+
*dfi_dst = __low2half(*dsi_src);
|
3204
3348
|
}
|
3205
3349
|
}
|
3206
3350
|
|
@@ -3223,11 +3367,7 @@ static __global__ void mul_mat_q(
|
|
3223
3367
|
}
|
3224
3368
|
}
|
3225
3369
|
|
3226
|
-
|
3227
|
-
if (row_dst >= nrows_dst) {
|
3228
|
-
return;
|
3229
|
-
}
|
3230
|
-
|
3370
|
+
#pragma unroll
|
3231
3371
|
for (int j = 0; j < mmq_x; j += nwarps) {
|
3232
3372
|
const int col_dst = col_dst_0 + j + threadIdx.y;
|
3233
3373
|
|
@@ -3235,12 +3375,375 @@ static __global__ void mul_mat_q(
|
|
3235
3375
|
return;
|
3236
3376
|
}
|
3237
3377
|
|
3378
|
+
#pragma unroll
|
3238
3379
|
for (int i = 0; i < mmq_y; i += WARP_SIZE) {
|
3239
|
-
|
3380
|
+
const int row_dst = row_dst_0 + threadIdx.x + i;
|
3381
|
+
|
3382
|
+
if (row_dst >= nrows_dst) {
|
3383
|
+
continue;
|
3384
|
+
}
|
3385
|
+
|
3386
|
+
dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
|
3240
3387
|
}
|
3241
3388
|
}
|
3242
3389
|
}
|
3243
3390
|
|
3391
|
+
#define MMQ_X_Q4_0_AMPERE 64
|
3392
|
+
#define MMQ_Y_Q4_0_AMPERE 128
|
3393
|
+
#define NWARPS_Q4_0_AMPERE 4
|
3394
|
+
#define MMQ_X_Q4_0_PASCAL 64
|
3395
|
+
#define MMQ_Y_Q4_0_PASCAL 64
|
3396
|
+
#define NWARPS_Q4_0_PASCAL 8
|
3397
|
+
|
3398
|
+
template <bool need_check> static __global__ void mul_mat_q4_0(
|
3399
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3400
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3401
|
+
|
3402
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3403
|
+
const int mmq_x = MMQ_X_Q4_0_AMPERE;
|
3404
|
+
const int mmq_y = MMQ_Y_Q4_0_AMPERE;
|
3405
|
+
const int nwarps = NWARPS_Q4_0_AMPERE;
|
3406
|
+
|
3407
|
+
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3408
|
+
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3409
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3410
|
+
|
3411
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3412
|
+
const int mmq_x = MMQ_X_Q4_0_PASCAL;
|
3413
|
+
const int mmq_y = MMQ_Y_Q4_0_PASCAL;
|
3414
|
+
const int nwarps = NWARPS_Q4_0_PASCAL;
|
3415
|
+
|
3416
|
+
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3417
|
+
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3418
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3419
|
+
#else
|
3420
|
+
(void) vec_dot_q4_0_q8_1_mul_mat;
|
3421
|
+
assert(false);
|
3422
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3423
|
+
}
|
3424
|
+
|
3425
|
+
#define MMQ_X_Q4_1_AMPERE 64
|
3426
|
+
#define MMQ_Y_Q4_1_AMPERE 128
|
3427
|
+
#define NWARPS_Q4_1_AMPERE 4
|
3428
|
+
#define MMQ_X_Q4_1_PASCAL 64
|
3429
|
+
#define MMQ_Y_Q4_1_PASCAL 64
|
3430
|
+
#define NWARPS_Q4_1_PASCAL 8
|
3431
|
+
|
3432
|
+
template <bool need_check> static __global__ void
|
3433
|
+
#if __CUDA_ARCH__ < CC_TURING
|
3434
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
|
3435
|
+
#endif // __CUDA_ARCH__ < CC_TURING
|
3436
|
+
mul_mat_q4_1(
|
3437
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3438
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3439
|
+
|
3440
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3441
|
+
const int mmq_x = MMQ_X_Q4_1_AMPERE;
|
3442
|
+
const int mmq_y = MMQ_Y_Q4_1_AMPERE;
|
3443
|
+
const int nwarps = NWARPS_Q4_1_AMPERE;
|
3444
|
+
|
3445
|
+
mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
|
3446
|
+
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
|
3447
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3448
|
+
|
3449
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3450
|
+
const int mmq_x = MMQ_X_Q4_1_PASCAL;
|
3451
|
+
const int mmq_y = MMQ_Y_Q4_1_PASCAL;
|
3452
|
+
const int nwarps = NWARPS_Q4_1_PASCAL;
|
3453
|
+
|
3454
|
+
mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
|
3455
|
+
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
|
3456
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3457
|
+
#else
|
3458
|
+
(void) vec_dot_q4_1_q8_1_mul_mat;
|
3459
|
+
assert(false);
|
3460
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3461
|
+
}
|
3462
|
+
|
3463
|
+
#define MMQ_X_Q5_0_AMPERE 128
|
3464
|
+
#define MMQ_Y_Q5_0_AMPERE 64
|
3465
|
+
#define NWARPS_Q5_0_AMPERE 4
|
3466
|
+
#define MMQ_X_Q5_0_PASCAL 64
|
3467
|
+
#define MMQ_Y_Q5_0_PASCAL 64
|
3468
|
+
#define NWARPS_Q5_0_PASCAL 8
|
3469
|
+
|
3470
|
+
template <bool need_check> static __global__ void mul_mat_q5_0(
|
3471
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3472
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3473
|
+
|
3474
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3475
|
+
const int mmq_x = MMQ_X_Q5_0_AMPERE;
|
3476
|
+
const int mmq_y = MMQ_Y_Q5_0_AMPERE;
|
3477
|
+
const int nwarps = NWARPS_Q5_0_AMPERE;
|
3478
|
+
|
3479
|
+
mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
|
3480
|
+
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
3481
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3482
|
+
|
3483
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3484
|
+
const int mmq_x = MMQ_X_Q5_0_PASCAL;
|
3485
|
+
const int mmq_y = MMQ_Y_Q5_0_PASCAL;
|
3486
|
+
const int nwarps = NWARPS_Q5_0_PASCAL;
|
3487
|
+
|
3488
|
+
mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
|
3489
|
+
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
3490
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3491
|
+
#else
|
3492
|
+
(void) vec_dot_q5_0_q8_1_mul_mat;
|
3493
|
+
assert(false);
|
3494
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3495
|
+
}
|
3496
|
+
|
3497
|
+
#define MMQ_X_Q5_1_AMPERE 128
|
3498
|
+
#define MMQ_Y_Q5_1_AMPERE 64
|
3499
|
+
#define NWARPS_Q5_1_AMPERE 4
|
3500
|
+
#define MMQ_X_Q5_1_PASCAL 64
|
3501
|
+
#define MMQ_Y_Q5_1_PASCAL 64
|
3502
|
+
#define NWARPS_Q5_1_PASCAL 8
|
3503
|
+
|
3504
|
+
template <bool need_check> static __global__ void mul_mat_q5_1(
|
3505
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3506
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3507
|
+
|
3508
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3509
|
+
const int mmq_x = MMQ_X_Q5_1_AMPERE;
|
3510
|
+
const int mmq_y = MMQ_Y_Q5_1_AMPERE;
|
3511
|
+
const int nwarps = NWARPS_Q5_1_AMPERE;
|
3512
|
+
|
3513
|
+
mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
|
3514
|
+
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
3515
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3516
|
+
|
3517
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3518
|
+
const int mmq_x = MMQ_X_Q5_1_PASCAL;
|
3519
|
+
const int mmq_y = MMQ_Y_Q5_1_PASCAL;
|
3520
|
+
const int nwarps = NWARPS_Q5_1_PASCAL;
|
3521
|
+
|
3522
|
+
mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
|
3523
|
+
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
3524
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3525
|
+
#else
|
3526
|
+
(void) vec_dot_q5_1_q8_1_mul_mat;
|
3527
|
+
assert(false);
|
3528
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3529
|
+
}
|
3530
|
+
|
3531
|
+
#define MMQ_X_Q8_0_AMPERE 128
|
3532
|
+
#define MMQ_Y_Q8_0_AMPERE 64
|
3533
|
+
#define NWARPS_Q8_0_AMPERE 4
|
3534
|
+
#define MMQ_X_Q8_0_PASCAL 64
|
3535
|
+
#define MMQ_Y_Q8_0_PASCAL 64
|
3536
|
+
#define NWARPS_Q8_0_PASCAL 8
|
3537
|
+
|
3538
|
+
template <bool need_check> static __global__ void mul_mat_q8_0(
|
3539
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3540
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3541
|
+
|
3542
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3543
|
+
const int mmq_x = MMQ_X_Q8_0_AMPERE;
|
3544
|
+
const int mmq_y = MMQ_Y_Q8_0_AMPERE;
|
3545
|
+
const int nwarps = NWARPS_Q8_0_AMPERE;
|
3546
|
+
|
3547
|
+
mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
|
3548
|
+
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
3549
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3550
|
+
|
3551
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3552
|
+
const int mmq_x = MMQ_X_Q8_0_PASCAL;
|
3553
|
+
const int mmq_y = MMQ_Y_Q8_0_PASCAL;
|
3554
|
+
const int nwarps = NWARPS_Q8_0_PASCAL;
|
3555
|
+
|
3556
|
+
mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
|
3557
|
+
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
3558
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3559
|
+
#else
|
3560
|
+
(void) vec_dot_q8_0_q8_1_mul_mat;
|
3561
|
+
assert(false);
|
3562
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3563
|
+
}
|
3564
|
+
|
3565
|
+
#define MMQ_X_Q2_K_AMPERE 64
|
3566
|
+
#define MMQ_Y_Q2_K_AMPERE 128
|
3567
|
+
#define NWARPS_Q2_K_AMPERE 4
|
3568
|
+
#define MMQ_X_Q2_K_PASCAL 64
|
3569
|
+
#define MMQ_Y_Q2_K_PASCAL 64
|
3570
|
+
#define NWARPS_Q2_K_PASCAL 8
|
3571
|
+
|
3572
|
+
template <bool need_check> static __global__ void mul_mat_q2_K(
|
3573
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3574
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3575
|
+
|
3576
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3577
|
+
const int mmq_x = MMQ_X_Q2_K_AMPERE;
|
3578
|
+
const int mmq_y = MMQ_Y_Q2_K_AMPERE;
|
3579
|
+
const int nwarps = NWARPS_Q2_K_AMPERE;
|
3580
|
+
|
3581
|
+
mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
|
3582
|
+
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
|
3583
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3584
|
+
|
3585
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3586
|
+
const int mmq_x = MMQ_X_Q2_K_PASCAL;
|
3587
|
+
const int mmq_y = MMQ_Y_Q2_K_PASCAL;
|
3588
|
+
const int nwarps = NWARPS_Q2_K_PASCAL;
|
3589
|
+
|
3590
|
+
mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
|
3591
|
+
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
|
3592
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3593
|
+
#else
|
3594
|
+
(void) vec_dot_q2_K_q8_1_mul_mat;
|
3595
|
+
assert(false);
|
3596
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3597
|
+
}
|
3598
|
+
|
3599
|
+
#define MMQ_X_Q3_K_AMPERE 128
|
3600
|
+
#define MMQ_Y_Q3_K_AMPERE 128
|
3601
|
+
#define NWARPS_Q3_K_AMPERE 4
|
3602
|
+
#define MMQ_X_Q3_K_PASCAL 64
|
3603
|
+
#define MMQ_Y_Q3_K_PASCAL 64
|
3604
|
+
#define NWARPS_Q3_K_PASCAL 8
|
3605
|
+
|
3606
|
+
template <bool need_check> static __global__ void
|
3607
|
+
#if __CUDA_ARCH__ < CC_TURING
|
3608
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
|
3609
|
+
#endif // __CUDA_ARCH__ < CC_TURING
|
3610
|
+
mul_mat_q3_K(
|
3611
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3612
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3613
|
+
|
3614
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3615
|
+
const int mmq_x = MMQ_X_Q3_K_AMPERE;
|
3616
|
+
const int mmq_y = MMQ_Y_Q3_K_AMPERE;
|
3617
|
+
const int nwarps = NWARPS_Q3_K_AMPERE;
|
3618
|
+
|
3619
|
+
mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
|
3620
|
+
load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
|
3621
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3622
|
+
|
3623
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3624
|
+
const int mmq_x = MMQ_X_Q3_K_PASCAL;
|
3625
|
+
const int mmq_y = MMQ_Y_Q3_K_PASCAL;
|
3626
|
+
const int nwarps = NWARPS_Q3_K_PASCAL;
|
3627
|
+
|
3628
|
+
mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
|
3629
|
+
load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
|
3630
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3631
|
+
#else
|
3632
|
+
(void) vec_dot_q3_K_q8_1_mul_mat;
|
3633
|
+
assert(false);
|
3634
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3635
|
+
}
|
3636
|
+
|
3637
|
+
#define MMQ_X_Q4_K_AMPERE 64
|
3638
|
+
#define MMQ_Y_Q4_K_AMPERE 128
|
3639
|
+
#define NWARPS_Q4_K_AMPERE 4
|
3640
|
+
#define MMQ_X_Q4_K_PASCAL 64
|
3641
|
+
#define MMQ_Y_Q4_K_PASCAL 64
|
3642
|
+
#define NWARPS_Q4_K_PASCAL 8
|
3643
|
+
|
3644
|
+
template <bool need_check> static __global__ void
|
3645
|
+
#if __CUDA_ARCH__ < CC_TURING
|
3646
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
|
3647
|
+
#endif // __CUDA_ARCH__ < CC_TURING
|
3648
|
+
mul_mat_q4_K(
|
3649
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3650
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3651
|
+
|
3652
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3653
|
+
const int mmq_x = MMQ_X_Q4_K_AMPERE;
|
3654
|
+
const int mmq_y = MMQ_Y_Q4_K_AMPERE;
|
3655
|
+
const int nwarps = NWARPS_Q4_K_AMPERE;
|
3656
|
+
|
3657
|
+
mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
|
3658
|
+
load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
|
3659
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3660
|
+
|
3661
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3662
|
+
const int mmq_x = MMQ_X_Q4_K_PASCAL;
|
3663
|
+
const int mmq_y = MMQ_Y_Q4_K_PASCAL;
|
3664
|
+
const int nwarps = NWARPS_Q4_K_PASCAL;
|
3665
|
+
|
3666
|
+
mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
|
3667
|
+
load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
|
3668
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3669
|
+
#else
|
3670
|
+
(void) vec_dot_q4_K_q8_1_mul_mat;
|
3671
|
+
assert(false);
|
3672
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3673
|
+
}
|
3674
|
+
|
3675
|
+
#define MMQ_X_Q5_K_AMPERE 64
|
3676
|
+
#define MMQ_Y_Q5_K_AMPERE 128
|
3677
|
+
#define NWARPS_Q5_K_AMPERE 4
|
3678
|
+
#define MMQ_X_Q5_K_PASCAL 64
|
3679
|
+
#define MMQ_Y_Q5_K_PASCAL 64
|
3680
|
+
#define NWARPS_Q5_K_PASCAL 8
|
3681
|
+
|
3682
|
+
template <bool need_check> static __global__ void mul_mat_q5_K(
|
3683
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3684
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3685
|
+
|
3686
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3687
|
+
const int mmq_x = MMQ_X_Q5_K_AMPERE;
|
3688
|
+
const int mmq_y = MMQ_Y_Q5_K_AMPERE;
|
3689
|
+
const int nwarps = NWARPS_Q5_K_AMPERE;
|
3690
|
+
|
3691
|
+
mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
|
3692
|
+
load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
|
3693
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3694
|
+
|
3695
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3696
|
+
const int mmq_x = MMQ_X_Q5_K_PASCAL;
|
3697
|
+
const int mmq_y = MMQ_Y_Q5_K_PASCAL;
|
3698
|
+
const int nwarps = NWARPS_Q5_K_PASCAL;
|
3699
|
+
|
3700
|
+
mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
|
3701
|
+
load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
|
3702
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3703
|
+
#else
|
3704
|
+
(void) vec_dot_q5_K_q8_1_mul_mat;
|
3705
|
+
assert(false);
|
3706
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3707
|
+
}
|
3708
|
+
|
3709
|
+
#define MMQ_X_Q6_K_AMPERE 64
|
3710
|
+
#define MMQ_Y_Q6_K_AMPERE 64
|
3711
|
+
#define NWARPS_Q6_K_AMPERE 4
|
3712
|
+
#define MMQ_X_Q6_K_PASCAL 64
|
3713
|
+
#define MMQ_Y_Q6_K_PASCAL 64
|
3714
|
+
#define NWARPS_Q6_K_PASCAL 8
|
3715
|
+
|
3716
|
+
template <bool need_check> static __global__ void
|
3717
|
+
#if __CUDA_ARCH__ < CC_TURING
|
3718
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
|
3719
|
+
#endif // __CUDA_ARCH__ < CC_TURING
|
3720
|
+
mul_mat_q6_K(
|
3721
|
+
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3722
|
+
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3723
|
+
|
3724
|
+
#if __CUDA_ARCH__ >= CC_TURING
|
3725
|
+
const int mmq_x = MMQ_X_Q6_K_AMPERE;
|
3726
|
+
const int mmq_y = MMQ_Y_Q6_K_AMPERE;
|
3727
|
+
const int nwarps = NWARPS_Q6_K_AMPERE;
|
3728
|
+
|
3729
|
+
mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
|
3730
|
+
load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
|
3731
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3732
|
+
|
3733
|
+
#elif __CUDA_ARCH__ >= MIN_CC_DP4A
|
3734
|
+
const int mmq_x = MMQ_X_Q6_K_PASCAL;
|
3735
|
+
const int mmq_y = MMQ_Y_Q6_K_PASCAL;
|
3736
|
+
const int nwarps = NWARPS_Q6_K_PASCAL;
|
3737
|
+
|
3738
|
+
mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
|
3739
|
+
load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
|
3740
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3741
|
+
#else
|
3742
|
+
(void) vec_dot_q6_K_q8_1_mul_mat;
|
3743
|
+
assert(false);
|
3744
|
+
#endif // __CUDA_ARCH__ >= CC_TURING
|
3745
|
+
}
|
3746
|
+
|
3244
3747
|
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
3245
3748
|
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
|
3246
3749
|
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
@@ -3485,13 +3988,13 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
|
3485
3988
|
// rope == RoPE == rotary positional embedding
|
3486
3989
|
static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
|
3487
3990
|
const float p_delta, const int p_delta_rows, const float theta_scale) {
|
3488
|
-
const int col = 2*(blockDim.
|
3991
|
+
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
3489
3992
|
|
3490
3993
|
if (col >= ncols) {
|
3491
3994
|
return;
|
3492
3995
|
}
|
3493
3996
|
|
3494
|
-
const int row = blockDim.
|
3997
|
+
const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
3495
3998
|
const int i = row*ncols + col;
|
3496
3999
|
|
3497
4000
|
const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
|
@@ -3505,6 +4008,28 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
|
|
3505
4008
|
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
3506
4009
|
}
|
3507
4010
|
|
4011
|
+
static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
|
4012
|
+
const float p_delta, const int p_delta_rows, const float theta_scale) {
|
4013
|
+
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
4014
|
+
|
4015
|
+
if (col >= ncols) {
|
4016
|
+
return;
|
4017
|
+
}
|
4018
|
+
|
4019
|
+
const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
4020
|
+
const int i = row*ncols + col/2;
|
4021
|
+
|
4022
|
+
const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
|
4023
|
+
const float sin_theta = sinf(theta);
|
4024
|
+
const float cos_theta = cosf(theta);
|
4025
|
+
|
4026
|
+
const float x0 = x[i + 0];
|
4027
|
+
const float x1 = x[i + ncols/2];
|
4028
|
+
|
4029
|
+
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
4030
|
+
dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
|
4031
|
+
}
|
4032
|
+
|
3508
4033
|
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
|
3509
4034
|
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
3510
4035
|
const int half_n_dims = ncols/4;
|
@@ -3539,9 +4064,32 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
|
|
3539
4064
|
dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
|
3540
4065
|
}
|
3541
4066
|
|
3542
|
-
static __global__ void
|
4067
|
+
static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
|
4068
|
+
const int n_heads_log2_floor, const float m0, const float m1) {
|
3543
4069
|
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
4070
|
+
|
4071
|
+
if (col >= ncols) {
|
4072
|
+
return;
|
4073
|
+
}
|
4074
|
+
|
3544
4075
|
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
4076
|
+
const int i = row*ncols + col;
|
4077
|
+
|
4078
|
+
const int k = row/k_rows;
|
4079
|
+
|
4080
|
+
float m_k;
|
4081
|
+
if (k < n_heads_log2_floor) {
|
4082
|
+
m_k = powf(m0, k + 1);
|
4083
|
+
} else {
|
4084
|
+
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
|
4085
|
+
}
|
4086
|
+
|
4087
|
+
dst[i] = col * m_k + x[i];
|
4088
|
+
}
|
4089
|
+
|
4090
|
+
static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
|
4091
|
+
const int col = blockDim.y*blockIdx.y + threadIdx.y;
|
4092
|
+
const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
3545
4093
|
|
3546
4094
|
if (col >= ncols) {
|
3547
4095
|
return;
|
@@ -3554,24 +4102,29 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
|
|
3554
4102
|
|
3555
4103
|
// the CUDA soft max implementation differs from the CPU implementation
|
3556
4104
|
// instead of doubles floats are used
|
3557
|
-
// values are also not normalized to the maximum value by subtracting it in the exponential function
|
3558
|
-
// theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
|
3559
4105
|
static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
|
3560
|
-
const int row = blockDim.
|
3561
|
-
const int block_size = blockDim.
|
3562
|
-
const int tid = threadIdx.
|
4106
|
+
const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
4107
|
+
const int block_size = blockDim.y;
|
4108
|
+
const int tid = threadIdx.y;
|
3563
4109
|
|
3564
|
-
float
|
4110
|
+
float max_val = -INFINITY;
|
3565
4111
|
|
3566
|
-
for (int
|
3567
|
-
const int
|
4112
|
+
for (int col = tid; col < ncols; col += block_size) {
|
4113
|
+
const int i = row*ncols + col;
|
4114
|
+
max_val = max(max_val, x[i]);
|
4115
|
+
}
|
3568
4116
|
|
3569
|
-
|
3570
|
-
|
3571
|
-
|
4117
|
+
// find the max value in the block
|
4118
|
+
#pragma unroll
|
4119
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
4120
|
+
max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
|
4121
|
+
}
|
3572
4122
|
|
4123
|
+
float tmp = 0.f;
|
4124
|
+
|
4125
|
+
for (int col = tid; col < ncols; col += block_size) {
|
3573
4126
|
const int i = row*ncols + col;
|
3574
|
-
const float val = expf(x[i]);
|
4127
|
+
const float val = expf(x[i] - max_val);
|
3575
4128
|
tmp += val;
|
3576
4129
|
dst[i] = val;
|
3577
4130
|
}
|
@@ -3582,15 +4135,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
|
|
3582
4135
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
3583
4136
|
}
|
3584
4137
|
|
3585
|
-
|
3586
|
-
const int col = block_start + tid;
|
3587
|
-
|
3588
|
-
if (col >= ncols) {
|
3589
|
-
break;
|
3590
|
-
}
|
4138
|
+
const float inv_tmp = 1.f / tmp;
|
3591
4139
|
|
4140
|
+
for (int col = tid; col < ncols; col += block_size) {
|
3592
4141
|
const int i = row*ncols + col;
|
3593
|
-
dst[i]
|
4142
|
+
dst[i] *= inv_tmp;
|
3594
4143
|
}
|
3595
4144
|
}
|
3596
4145
|
|
@@ -3942,48 +4491,32 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
|
|
3942
4491
|
CUDA_CHECK(cudaGetDevice(&id));
|
3943
4492
|
const int compute_capability = g_compute_capabilities[id];
|
3944
4493
|
|
4494
|
+
int mmq_x, mmq_y, nwarps;
|
3945
4495
|
if (compute_capability >= CC_TURING) {
|
3946
|
-
|
3947
|
-
|
3948
|
-
|
3949
|
-
|
3950
|
-
|
3951
|
-
|
3952
|
-
|
3953
|
-
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
3954
|
-
|
3955
|
-
if (nrows_x % mmq_y == 0) {
|
3956
|
-
const bool need_check = false;
|
3957
|
-
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3958
|
-
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3959
|
-
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3960
|
-
} else {
|
3961
|
-
const bool need_check = true;
|
3962
|
-
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3963
|
-
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3964
|
-
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3965
|
-
}
|
4496
|
+
mmq_x = MMQ_X_Q4_0_AMPERE;
|
4497
|
+
mmq_y = MMQ_Y_Q4_0_AMPERE;
|
4498
|
+
nwarps = NWARPS_Q4_0_AMPERE;
|
4499
|
+
} else if (compute_capability >= MIN_CC_DP4A) {
|
4500
|
+
mmq_x = MMQ_X_Q4_0_PASCAL;
|
4501
|
+
mmq_y = MMQ_Y_Q4_0_PASCAL;
|
4502
|
+
nwarps = NWARPS_Q4_0_PASCAL;
|
3966
4503
|
} else {
|
3967
|
-
|
3968
|
-
|
3969
|
-
|
3970
|
-
|
3971
|
-
|
3972
|
-
|
3973
|
-
|
3974
|
-
|
3975
|
-
|
3976
|
-
|
3977
|
-
|
3978
|
-
|
3979
|
-
|
3980
|
-
|
3981
|
-
|
3982
|
-
|
3983
|
-
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3984
|
-
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3985
|
-
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3986
|
-
}
|
4504
|
+
GGML_ASSERT(false);
|
4505
|
+
}
|
4506
|
+
|
4507
|
+
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
|
4508
|
+
const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
|
4509
|
+
const dim3 block_nums(block_num_x, block_num_y, 1);
|
4510
|
+
const dim3 block_dims(WARP_SIZE, nwarps, 1);
|
4511
|
+
|
4512
|
+
if (nrows_x % mmq_y == 0) {
|
4513
|
+
const bool need_check = false;
|
4514
|
+
mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
|
4515
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4516
|
+
} else {
|
4517
|
+
const bool need_check = true;
|
4518
|
+
mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
|
4519
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3987
4520
|
}
|
3988
4521
|
}
|
3989
4522
|
|
@@ -3995,49 +4528,32 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q4_1_AMPERE;
+        mmq_y = MMQ_Y_Q4_1_AMPERE;
+        nwarps = NWARPS_Q4_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q4_1_PASCAL;
+        mmq_y = MMQ_Y_Q4_1_PASCAL;
+        nwarps = NWARPS_Q4_1_PASCAL;
     } else {
-
-
-        const int nwarps = 8;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }

+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4049,48 +4565,32 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q5_0_AMPERE;
+        mmq_y = MMQ_Y_Q5_0_AMPERE;
+        nwarps = NWARPS_Q5_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q5_0_PASCAL;
+        mmq_y = MMQ_Y_Q5_0_PASCAL;
+        nwarps = NWARPS_Q5_0_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4102,48 +4602,32 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q5_1_AMPERE;
+        mmq_y = MMQ_Y_Q5_1_AMPERE;
+        nwarps = NWARPS_Q5_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q5_1_PASCAL;
+        mmq_y = MMQ_Y_Q5_1_PASCAL;
+        nwarps = NWARPS_Q5_1_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4155,48 +4639,32 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q8_0_AMPERE;
+        mmq_y = MMQ_Y_Q8_0_AMPERE;
+        nwarps = NWARPS_Q8_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q8_0_PASCAL;
+        mmq_y = MMQ_Y_Q8_0_PASCAL;
+        nwarps = NWARPS_Q8_0_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4208,48 +4676,32 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q2_K_AMPERE;
+        mmq_y = MMQ_Y_Q2_K_AMPERE;
+        nwarps = NWARPS_Q2_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q2_K_PASCAL;
+        mmq_y = MMQ_Y_Q2_K_PASCAL;
+        nwarps = NWARPS_Q2_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4261,48 +4713,32 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q3_K_AMPERE;
+        mmq_y = MMQ_Y_Q3_K_AMPERE;
+        nwarps = NWARPS_Q3_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q3_K_PASCAL;
+        mmq_y = MMQ_Y_Q3_K_PASCAL;
+        nwarps = NWARPS_Q3_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4314,48 +4750,32 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q4_K_AMPERE;
+        mmq_y = MMQ_Y_Q4_K_AMPERE;
+        nwarps = NWARPS_Q4_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q4_K_PASCAL;
+        mmq_y = MMQ_Y_Q4_K_PASCAL;
+        nwarps = NWARPS_Q4_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4367,48 +4787,32 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q5_K_AMPERE;
+        mmq_y = MMQ_Y_Q5_K_AMPERE;
+        nwarps = NWARPS_Q5_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q5_K_PASCAL;
+        mmq_y = MMQ_Y_Q5_K_PASCAL;
+        nwarps = NWARPS_Q5_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4420,48 +4824,32 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];

+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x = MMQ_X_Q6_K_AMPERE;
+        mmq_y = MMQ_Y_Q6_K_AMPERE;
+        nwarps = NWARPS_Q6_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x = MMQ_X_Q6_K_PASCAL;
+        mmq_y = MMQ_Y_Q6_K_PASCAL;
+        nwarps = NWARPS_Q6_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }

@@ -4511,13 +4899,21 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons

 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                           const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(nrows % 2 == 0);
-    const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+    GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
+    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-    const dim3 block_nums(num_blocks_x, nrows, 1);
+    const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }

+static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+                               const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nrows, num_blocks_x, 1);
+    rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
+}
+
 static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(nrows % 4 == 0);
     const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@@ -4526,16 +4922,25 @@ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, con
     rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
 }

+static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
+                           const int k_rows, const int n_heads_log2_floor, const float m0,
+                           const float m1, cudaStream_t stream) {
+    const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
+    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
+    const dim3 block_nums(num_blocks_x, nrows, 1);
+    alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
-    const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
+    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
-    const dim3 block_nums(block_num_x, nrows_x, 1);
+    const dim3 block_nums(nrows_x, block_num_x, 1);
     diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 }

 static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums(1, nrows_x, 1);
+    const dim3 block_dims(1, WARP_SIZE, 1);
+    const dim3 block_nums(nrows_x, 1, 1);
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
 }

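The rope_f32_cuda, rope_neox_f32_cuda, diag_mask_inf_f32_cuda and soft_max_f32_cuda launchers above all adopt the same convention: one row per block on gridDim.x, with threadIdx.y (and, where needed, gridDim.y) covering the columns. A small illustrative kernel showing how indexing looks under that convention (this is not a kernel from this file):

    __global__ void row_major_example(const float * x, float * dst, const int ncols) {
        const int row = blockIdx.x;                            // one row per block on grid.x
        const int col = blockIdx.y*blockDim.y + threadIdx.y;   // columns tiled over grid.y / block.y
        if (col >= ncols) {
            return;
        }
        const int i = row*ncols + col;
        dst[i] = x[i];
    }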
@@ -4640,10 +5045,18 @@ void ggml_init_cublas() {
     static bool initialized = false;

     if (!initialized) {
+
+#ifdef __HIP_PLATFORM_AMD__
+        // Workaround for a rocBLAS bug when using multiple graphics cards:
+        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+        rocblas_initialize();
+        CUDA_CHECK(cudaDeviceSynchronize());
+#endif
+
         CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
-        fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
         for (int id = 0; id < g_device_count; ++id) {
             cudaDeviceProp prop;
             CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
@@ -5241,7 +5654,8 @@ inline void ggml_cuda_op_rope(

     const float theta_scale = powf(freq_base, -2.0f/n_dims);

-    const bool is_glm = mode & 4;
+    const bool is_neox = mode & 2;
+    const bool is_glm = mode & 4;

     // compute
     if (is_glm) {
@@ -5249,6 +5663,10 @@ inline void ggml_cuda_op_rope(
         const float id_p = min(p, n_ctx - 2.f);
         const float block_p = max(p - (n_ctx - 2.f), 0.f);
         rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+    } else if (is_neox) {
+        GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
+        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+        rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
     } else {
         const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
         rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
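The rope mode argument is treated as a bit field in the two hunks above: bit 1 selects the NeoX-style rotation, bit 2 the GLM variant, and bit 0 decides whether n_past is folded into p0. A tiny host-side sketch of the same decoding (the concrete values are arbitrary examples):

    #include <cstdio>

    int main() {
        const int mode = 2;                      // e.g. NeoX-style rope
        const bool is_neox = mode & 2;
        const bool is_glm  = mode & 4;
        const int   n_past     = 32;
        const float freq_scale = 1.0f;
        const float p0 = ((mode & 1) == 0 ? n_past : 0) * freq_scale;
        std::printf("neox=%d glm=%d p0=%.1f\n", is_neox, is_glm, p0);
        return 0;
    }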
@@ -5261,6 +5679,41 @@ inline void ggml_cuda_op_rope(
     (void) i1;
 }

+inline void ggml_cuda_op_alibi(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_head = ((int32_t *) dst->op_params)[1];
+    float max_bias;
+    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+
+    GGML_ASSERT(ne01 + n_past == ne00);
+    GGML_ASSERT(n_head == ne02);
+
+    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+
+    // compute
+    alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
+
+    (void) src1;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
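ggml_cuda_op_alibi derives the two slope bases m0 and m1 from max_bias and the largest power-of-two head count. A host-side worked example of that arithmetic, extended with the per-head ALiBi schedule for intuition (head count and bias are made-up examples, and the per-head loop mirrors ggml's CPU implementation rather than code shown in this diff):

    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_head   = 12;      // illustrative head count
        const float max_bias = 8.0f;    // illustrative bias value

        const int n_heads_log2_floor = 1 << (int) std::floor(std::log2((float) n_head));  // -> 8
        const float m0 = std::pow(2.0f, -(max_bias)        / n_heads_log2_floor);
        const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

        // First n_heads_log2_floor heads use powers of m0, the remaining heads powers of m1.
        for (int h = 0; h < n_head; ++h) {
            const float slope = h < n_heads_log2_floor
                ? std::pow(m0, (float) (h + 1))
                : std::pow(m1, (float) (2*(h - n_heads_log2_floor) + 1));
            std::printf("head %2d: slope %.6f\n", h, slope);
        }
        return 0;
    }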
@@ -5881,6 +6334,11 @@ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ten
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }

+void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+}
+
 void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
@@ -6000,7 +6458,7 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     return extra;
 }

-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
@@ -6009,14 +6467,19 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
-            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
+            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
         }
     }
     if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
+        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
     }

     tensor->backend = GGML_BACKEND_GPU;
+
+    if (scratch && no_alloc) {
+        return;
+    }
+
     struct ggml_tensor_extra_gpu * extra;

     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
@@ -6068,16 +6531,48 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     tensor->extra = extra;
 }

+void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
+    if (g_scratch_size == 0) {
+        return;
+    }
+    if (g_scratch_buffer == nullptr) {
+        CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
+    }
+
+    struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+
+    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
+        tensor->op == GGML_OP_VIEW;
+
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+        size_t view_offset = 0;
+        if (tensor->op == GGML_OP_VIEW) {
+            memcpy(&view_offset, tensor->op_params, sizeof(size_t));
+        }
+        extra->data_device[g_main_device] = src0_ddc + view_offset;
+    } else {
+        extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
+    }
+
+    tensor->extra = extra;
+}
+
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true, false);
+    ggml_cuda_assign_buffers_impl(tensor, true, false, false);
+}
+
+void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, true, false, true);
 }

 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, false);
+    ggml_cuda_assign_buffers_impl(tensor, false, false, false);
 }

 void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, true);
+    ggml_cuda_assign_buffers_impl(tensor, false, true, false);
 }

 void ggml_cuda_set_main_device(int main_device) {
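Taken together, the new no_alloc flag and ggml_cuda_assign_scratch_offset split scratch handling into a tagging pass and a placement pass: tensors can be marked as GPU-backed without reserving scratch memory, then given an explicit offset once sizes are known. A hedged usage sketch (the offsets array is assumed to come from a separate measuring step such as ggml-alloc; this is not a calling sequence taken from llama.cpp itself):

    // Sketch only: how a caller might combine the two new entry points.
    #include "ggml.h"
    #include "ggml-cuda.h"

    static void place_on_scratch(struct ggml_tensor ** nodes, const size_t * offsets, int n_nodes) {
        for (int i = 0; i < n_nodes; ++i) {
            ggml_cuda_assign_buffers_no_alloc(nodes[i]);           // tag as GPU tensor, no scratch reserved yet
            ggml_cuda_assign_scratch_offset(nodes[i], offsets[i]); // then place it at its planned offset
        }
    }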
@@ -6216,6 +6711,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_rope;
             break;
+        case GGML_OP_ALIBI:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_alibi;
+            break;
         default:
             return false;
     }
@@ -6229,3 +6730,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     func(tensor->src[0], tensor->src[1], tensor);
     return true;
 }
+
+int ggml_cuda_get_device_count() {
+    int device_count;
+    CUDA_CHECK(cudaGetDeviceCount(&device_count));
+    return device_count;
+}
+
+void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+    snprintf(description, description_size, "%s", prop.name);
+}
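The two helpers added at the end expose device discovery to callers; a minimal consumer could look like this (sketch, assuming the matching declarations are available from ggml-cuda.h):

    #include <cstdio>
    #include "ggml-cuda.h"

    int main() {
        const int n = ggml_cuda_get_device_count();
        for (int i = 0; i < n; ++i) {
            char desc[256];
            ggml_cuda_get_device_description(i, desc, sizeof(desc));
            std::printf("device %d: %s\n", i, desc);
        }
        return 0;
    }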