llama_cpp 0.3.7 → 0.4.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +118 -117
- data/ext/llama_cpp/src/ggml-alloc.c +97 -53
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1010 -497
- data/ext/llama_cpp/src/ggml-cuda.h +32 -23
- data/ext/llama_cpp/src/ggml-metal.h +9 -3
- data/ext/llama_cpp/src/ggml-metal.m +142 -161
- data/ext/llama_cpp/src/ggml-metal.metal +577 -500
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama-util.h +10 -8
- data/ext/llama_cpp/src/llama.cpp +4544 -2890
- data/ext/llama_cpp/src/llama.h +133 -123
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +8 -8
- metadata +2 -2
--- a/data/ext/llama_cpp/src/ggml-cuda.cu
+++ b/data/ext/llama_cpp/src/ggml-cuda.cu
@@ -6,15 +6,116 @@
 #include <atomic>
 #include <assert.h>
 
+#if defined(GGML_USE_HIPBLAS)
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F HIPBLAS_R_16F
+#define CUDA_R_32F HIPBLAS_R_32F
+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasCreate hipblasCreate
+#define cublasGemmEx hipblasGemmEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventRecord hipEventRecord
+#define cudaEvent_t hipEvent_t
+#define cudaEventDestroy hipEventDestroy
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaMalloc hipMalloc
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyKind hipMemcpyKind
+#define cudaMemset hipMemset
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+#else
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
+#endif
 
 #include "ggml-cuda.h"
 #include "ggml.h"
 
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#ifndef CC_TURING
 #define CC_TURING 700
+#endif
+
+#if defined(GGML_USE_HIPBLAS)
+#define __CUDA_ARCH__ 1300
+
+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+    return reinterpret_cast<const int&>(c);
+}
+
+static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(__gfx1100__)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+}
+#endif
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -259,6 +360,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
@@ -286,7 +388,7 @@ static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q = false;
+static bool g_mul_mat_q = true;
 
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
@@ -423,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q4_1 * x = (const block_q4_1 *) vx;
 
-    const dfloat d = x[ib].dm.x;
-    const dfloat m = x[ib].dm.y;
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
 
     const int vui = x[ib].qs[iqs];
 
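Note: this hunk starts a pattern repeated throughout the file: direct `.x`/`.y` member access on `half2` values is replaced with the `__low2half`/`__high2half` intrinsics, because the struct members are a CUDA implementation detail that HIP's `half2` does not expose. A minimal sketch of the portable accessor pattern (hypothetical kernel, for illustration only):

    #include <cuda_fp16.h>

    // Portable unpacking of a packed (scale, min) half2 pair; works under both
    // CUDA and HIP, unlike direct .x/.y member access.
    static __global__ void unpack_dm(const half2 * dm, float * out) {
        out[0] = __low2float (dm[0]); // low  16 bits, e.g. the scale d
        out[1] = __high2float(dm[0]); // high 16 bits, e.g. the min m
    }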
@@ -466,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q5_1 * x = (const block_q5_1 *) vx;
 
-    const dfloat d = x[ib].dm.x;
-    const dfloat m = x[ib].dm.y;
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
 
     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -519,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const uint8_t q = x[i].qs[32*n + l];
     float * y = yy + i*QK_K + 128*n;
 
-    float dall = x[i].dm.x;
-    float dmin = x[i].dm.y;
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
     y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -530,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const int il = tid%16; // 0...15
     const uint8_t q = x[i].qs[il] >> (2*is);
     float * y = yy + i*QK_K + 16*is + il;
-    float dall = x[i].dm.x;
-    float dmin = x[i].dm.y;
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
     y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
 #endif
@@ -617,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 
     float * y = yy + i*QK_K + 64*il + n*ir;
 
-    const float dall = x[i].dm.x;
-    const float dmin = x[i].dm.y;
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
 
     const uint8_t * q = x[i].qs + 32*il + n*ir;
 
@@ -656,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
 
     float * y = yy + i*QK_K + 64*il + 2*ir;
 
-    const float dall = x[i].dm.x;
-    const float dmin = x[i].dm.y;
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
 
     const uint8_t * ql = x[i].qs + 32*il + 2*ir;
     const uint8_t * qh = x[i].qh + 2*ir;
@@ -769,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
     const float * y = yy + i * QK_K + y_offset;
     const uint8_t * q = x[i].qs + q_offset;
 
-    const float dall = x[i].dm.x;
-    const float dmin = x[i].dm.y;
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
 
     const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
     aux[0] = a[0] & 0x0f0f0f0f;
@@ -990,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     const float * y1 = yy + i*QK_K + y_offset;
     const float * y2 = y1 + 128;
 
-    const float dall = x[i].dm.x;
-    const float dmin = x[i].dm.y;
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
 
     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux[0] = a[im+0] & kmask1;
@@ -1123,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
     const float * y1 = yy + i*QK_K + y_offset;
     const float * y2 = y1 + 128;
 
-    const float dall = x[i].dm.x;
-    const float dmin = x[i].dm.y;
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
 
     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux[0] = a[im+0] & kmask1;
@@ -1347,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
         return;
     }
 
-    y[ib].ds.x = d;
-    y[ib].ds.y = sum;
+    reinterpret_cast<half&>(y[ib].ds.x) = d;
+    reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
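Note: `quantize_q8_1` packs the block scale `d` and a per-block sum into the single `half2` field `ds` of `block_q8_1`; the write now goes through `reinterpret_cast<half&>` instead of plain `.x`/`.y` assignment, for the same HIP-compatibility reason as above. A conceptual sketch of the layout (field order and the exact meaning of the high half are assumptions drawn from the surrounding code, not a verbatim copy):

    #include <cuda_fp16.h>
    #include <cstdint>

    // Conceptual q8_1 block: 32 int8 quants plus a packed half2 header.
    struct block_q8_1_sketch {
        half2  ds;     // low half: scale d; high half: per-block sum used by the *_q8_1 dot products
        int8_t qs[32]; // quantized values
    };
    // Readers unpack with __low2half(b.ds) / __high2half(b.ds) rather than b.ds.x / b.ds.y.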
@@ -1399,6 +1501,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1436,6 +1539,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1471,6 +1575,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1516,6 +1621,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1537,6 +1643,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
 
     return d8_0*d8_1 * sumi;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1567,6 +1674,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1602,6 +1710,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 
     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1639,6 +1748,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
 
     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1679,6 +1789,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 
     return d3 * sumf;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1704,6 +1815,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
 
     return d3*d8 * sumi;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1737,12 +1849,12 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 // contiguous u/y values
-// also used for q5_K
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
     const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
@@ -1752,19 +1864,18 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     float sumf_m = 0.0f;
 
 #pragma unroll
-    for (int
+    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
         int sumi_d = 0;
 
 #pragma unroll
-        for (int
-            sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product
-            sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
         }
 
-        const float2 ds8f = __half22float2(ds8[
+        const float2 ds8f = __half22float2(ds8[i]);
 
-        sumf_d += ds8f.x * (sc[
-        sumf_m += ds8f.y * m[
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
     }
 
     const float2 dm4f = __half22float2(dm4);
@@ -1772,6 +1883,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1780,7 +1892,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
 #define VDR_Q5_K_Q8_1_MMQ 8
 
 // contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
     const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
 
@@ -1812,6 +1924,41 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
     return dm5f.x*sumf_d - dm5f.y*sumf_m;
 
 #else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i]);
+
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
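Note: every `vec_dot_*` fallthrough branch now gains an explicit `assert(false)` when compiled for an architecture below `MIN_CC_DP4A` (compute capability 6.1, the first with `__dp4a`), so a code path that should never execute there fails loudly instead of silently returning zeros. The guard pattern, reduced to a self-contained skeleton (not taken verbatim from the diff):

    #include <cassert>

    // Skeleton of the arch guard used throughout the vec_dot_* device functions.
    static __device__ __forceinline__ float guarded_dot(const int v, const int u, const float d) {
    #if __CUDA_ARCH__ >= 610 // MIN_CC_DP4A: first arch with the __dp4a intrinsic
        return d * __dp4a(v, u, 0);
    #else
        assert(false); // unsupported arch: fail loudly in debug builds
        return 0.0f;   // only to satisfy the compiler
    #endif
    }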
@@ -1842,6 +1989,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 
     return d*sumf;
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -1873,6 +2021,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;
 
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
@@ -2298,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
         u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
     }
 
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2384,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR2_K; ++ i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
     }
 
     return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -2503,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR3_K; ++i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
     }
 
     return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -2672,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 
     for (int i = 0; i < QR4_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds.x;
+        d8[i] = __low2half(bq8i->ds);
 
         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
@@ -2699,8 +2848,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     const float dall = bq4_K->d[0];
     const float dmin = bq4_K->d[1];
 
-    const float d8_1 = bq8_1[0].ds.x;
-    const float d8_2 = bq8_1[1].ds.x;
+    const float d8_1 = __low2float(bq8_1[0].ds);
+    const float d8_2 = __low2float(bq8_1[1].ds);
 
     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2722,6 +2871,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;
 
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
@@ -2808,18 +2958,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
 
-    int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
-        v[l + 0]         = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
-        v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
-    }
-
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
 
     const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
-    return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
+    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
 }
 
 static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
@@ -2859,14 +3002,14 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR5_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds.x;
+        d8[i] = __low2float(bq8i->ds);
 
         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
         u[2*i+1] = q8[4];
     }
 
-    return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
+    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
 
 #else
 
@@ -2877,8 +3020,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 
     const float d = bq5_K->d;
 
-    const float d8_1 = bq8_1[0].ds.x;
-    const float d8_2 = bq8_1[1].ds.x;
+    const float d8_1 = __low2half(bq8_1[0].ds);
+    const float d8_2 = __low2half(bq8_1[1].ds);
 
     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2905,6 +3048,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;
 
 #else
+    assert(false);
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
@@ -3008,7 +3152,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
 
     const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k;
     const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE;
-    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
+    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
 }
 
 static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -3031,7 +3176,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR6_K; ++i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
     }
 
     return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -3135,7 +3280,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
 
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
           allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
-static __global__ void mul_mat_q(
+static __device__ __forceinline__ void mul_mat_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
 
@@ -3150,7 +3295,6 @@ static __global__ void mul_mat_q(
 
     const int row_dst_0 = blockIdx.x*mmq_y;
     const int & row_x_0 = row_dst_0;
-    const int row_dst = row_dst_0 + threadIdx.x;
 
     const int col_dst_0 = blockIdx.y*mmq_x;
     const int & col_y_0 = col_dst_0;
@@ -3200,7 +3344,7 @@
             *dsi_dst = *dsi_src;
         } else {
             float * dfi_dst = (float *) dsi_dst;
-            *dfi_dst = (*dsi_src).x;
+            *dfi_dst = __low2half(*dsi_src);
         }
     }
 
@@ -3223,11 +3367,7 @@
         }
     }
 
-
-    if (row_dst >= nrows_dst) {
-        return;
-    }
-
+#pragma unroll
     for (int j = 0; j < mmq_x; j += nwarps) {
         const int col_dst = col_dst_0 + j + threadIdx.y;
 
@@ -3235,12 +3375,375 @@
             return;
         }
 
+#pragma unroll
         for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
+            const int row_dst = row_dst_0 + threadIdx.x + i;
+
+            if (row_dst >= nrows_dst) {
+                continue;
+            }
+
+            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
         }
     }
 }
 
+#define MMQ_X_Q4_0_AMPERE 64
+#define MMQ_Y_Q4_0_AMPERE 128
+#define NWARPS_Q4_0_AMPERE 4
+#define MMQ_X_Q4_0_PASCAL 64
+#define MMQ_Y_Q4_0_PASCAL 64
+#define NWARPS_Q4_0_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q4_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q4_0_AMPERE;
+    const int mmq_y  = MMQ_Y_Q4_0_AMPERE;
+    const int nwarps = NWARPS_Q4_0_AMPERE;
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q4_0_PASCAL;
+    const int mmq_y  = MMQ_Y_Q4_0_PASCAL;
+    const int nwarps = NWARPS_Q4_0_PASCAL;
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q4_1_AMPERE 64
+#define MMQ_Y_Q4_1_AMPERE 128
+#define NWARPS_Q4_1_AMPERE 4
+#define MMQ_X_Q4_1_PASCAL 64
+#define MMQ_Y_Q4_1_PASCAL 64
+#define NWARPS_Q4_1_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q4_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q4_1_AMPERE;
+    const int mmq_y  = MMQ_Y_Q4_1_AMPERE;
+    const int nwarps = NWARPS_Q4_1_AMPERE;
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q4_1_PASCAL;
+    const int mmq_y  = MMQ_Y_Q4_1_PASCAL;
+    const int nwarps = NWARPS_Q4_1_PASCAL;
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_1_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q5_0_AMPERE 128
+#define MMQ_Y_Q5_0_AMPERE 64
+#define NWARPS_Q5_0_AMPERE 4
+#define MMQ_X_Q5_0_PASCAL 64
+#define MMQ_Y_Q5_0_PASCAL 64
+#define NWARPS_Q5_0_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q5_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q5_0_AMPERE;
+    const int mmq_y  = MMQ_Y_Q5_0_AMPERE;
+    const int nwarps = NWARPS_Q5_0_AMPERE;
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q5_0_PASCAL;
+    const int mmq_y  = MMQ_Y_Q5_0_PASCAL;
+    const int nwarps = NWARPS_Q5_0_PASCAL;
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q5_1_AMPERE 128
+#define MMQ_Y_Q5_1_AMPERE 64
+#define NWARPS_Q5_1_AMPERE 4
+#define MMQ_X_Q5_1_PASCAL 64
+#define MMQ_Y_Q5_1_PASCAL 64
+#define NWARPS_Q5_1_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q5_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q5_1_AMPERE;
+    const int mmq_y  = MMQ_Y_Q5_1_AMPERE;
+    const int nwarps = NWARPS_Q5_1_AMPERE;
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q5_1_PASCAL;
+    const int mmq_y  = MMQ_Y_Q5_1_PASCAL;
+    const int nwarps = NWARPS_Q5_1_PASCAL;
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_1_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q8_0_AMPERE 128
+#define MMQ_Y_Q8_0_AMPERE 64
+#define NWARPS_Q8_0_AMPERE 4
+#define MMQ_X_Q8_0_PASCAL 64
+#define MMQ_Y_Q8_0_PASCAL 64
+#define NWARPS_Q8_0_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q8_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q8_0_AMPERE;
+    const int mmq_y  = MMQ_Y_Q8_0_AMPERE;
+    const int nwarps = NWARPS_Q8_0_AMPERE;
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q8_0_PASCAL;
+    const int mmq_y  = MMQ_Y_Q8_0_PASCAL;
+    const int nwarps = NWARPS_Q8_0_PASCAL;
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q8_0_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q2_K_AMPERE 64
+#define MMQ_Y_Q2_K_AMPERE 128
+#define NWARPS_Q2_K_AMPERE 4
+#define MMQ_X_Q2_K_PASCAL 64
+#define MMQ_Y_Q2_K_PASCAL 64
+#define NWARPS_Q2_K_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q2_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q2_K_AMPERE;
+    const int mmq_y  = MMQ_Y_Q2_K_AMPERE;
+    const int nwarps = NWARPS_Q2_K_AMPERE;
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q2_K_PASCAL;
+    const int mmq_y  = MMQ_Y_Q2_K_PASCAL;
+    const int nwarps = NWARPS_Q2_K_PASCAL;
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q2_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q3_K_AMPERE 128
+#define MMQ_Y_Q3_K_AMPERE 128
+#define NWARPS_Q3_K_AMPERE 4
+#define MMQ_X_Q3_K_PASCAL 64
+#define MMQ_Y_Q3_K_PASCAL 64
+#define NWARPS_Q3_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q3_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q3_K_AMPERE;
+    const int mmq_y  = MMQ_Y_Q3_K_AMPERE;
+    const int nwarps = NWARPS_Q3_K_AMPERE;
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q3_K_PASCAL;
+    const int mmq_y  = MMQ_Y_Q3_K_PASCAL;
+    const int nwarps = NWARPS_Q3_K_PASCAL;
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q3_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q4_K_AMPERE 64
+#define MMQ_Y_Q4_K_AMPERE 128
+#define NWARPS_Q4_K_AMPERE 4
+#define MMQ_X_Q4_K_PASCAL 64
+#define MMQ_Y_Q4_K_PASCAL 64
+#define NWARPS_Q4_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q4_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q4_K_AMPERE;
+    const int mmq_y  = MMQ_Y_Q4_K_AMPERE;
+    const int nwarps = NWARPS_Q4_K_AMPERE;
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q4_K_PASCAL;
+    const int mmq_y  = MMQ_Y_Q4_K_PASCAL;
+    const int nwarps = NWARPS_Q4_K_PASCAL;
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q5_K_AMPERE 64
+#define MMQ_Y_Q5_K_AMPERE 128
+#define NWARPS_Q5_K_AMPERE 4
+#define MMQ_X_Q5_K_PASCAL 64
+#define MMQ_Y_Q5_K_PASCAL 64
+#define NWARPS_Q5_K_PASCAL 8
+
+template <bool need_check> static __global__ void mul_mat_q5_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q5_K_AMPERE;
+    const int mmq_y  = MMQ_Y_Q5_K_AMPERE;
+    const int nwarps = NWARPS_Q5_K_AMPERE;
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q5_K_PASCAL;
+    const int mmq_y  = MMQ_Y_Q5_K_PASCAL;
+    const int nwarps = NWARPS_Q5_K_PASCAL;
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
+#define MMQ_X_Q6_K_AMPERE 64
+#define MMQ_Y_Q6_K_AMPERE 64
+#define NWARPS_Q6_K_AMPERE 4
+#define MMQ_X_Q6_K_PASCAL 64
+#define MMQ_Y_Q6_K_PASCAL 64
+#define NWARPS_Q6_K_PASCAL 8
+
+template <bool need_check> static __global__ void
+#if __CUDA_ARCH__ < CC_TURING
+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_TURING
+    mul_mat_q6_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if __CUDA_ARCH__ >= CC_TURING
+    const int mmq_x  = MMQ_X_Q6_K_AMPERE;
+    const int mmq_y  = MMQ_Y_Q6_K_AMPERE;
+    const int nwarps = NWARPS_Q6_K_AMPERE;
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int mmq_x  = MMQ_X_Q6_K_PASCAL;
+    const int mmq_y  = MMQ_Y_Q6_K_PASCAL;
+    const int nwarps = NWARPS_Q6_K_PASCAL;
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q6_K_q8_1_mul_mat;
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_TURING
+}
+
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
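Note: the large addition above splits the former `__global__ mul_mat_q` kernel into a `__device__` core plus one thin `__global__` wrapper per quantization type, so the tile shape (`MMQ_X_*`/`MMQ_Y_*`) and warp count can be fixed per architecture at compile time: wide tiles on Turing and newer, smaller tiles with `__launch_bounds__` on Pascal. A reduced sketch of that dispatch shape (`core_mul_mat` and the `TILE_X_*` constants are hypothetical stand-ins, not names from the diff):

    #include <cassert>

    // Hypothetical __device__ core parameterized on the tile width.
    template <int tile_x, bool need_check>
    static __device__ __forceinline__ void core_mul_mat(const void * vx, float * dst) {
        // ... tile loop would go here (omitted) ...
    }

    #define TILE_X_AMPERE 64 // assumption: wide tiles for CC >= 7.0
    #define TILE_X_PASCAL 32 // assumption: narrow tiles for CC >= 6.1

    template <bool need_check> static __global__ void my_mul_mat(const void * vx, float * dst) {
    #if __CUDA_ARCH__ >= 700   // CC_TURING
        core_mul_mat<TILE_X_AMPERE, need_check>(vx, dst);
    #elif __CUDA_ARCH__ >= 610 // MIN_CC_DP4A
        core_mul_mat<TILE_X_PASCAL, need_check>(vx, dst);
    #else
        assert(false);         // no byte-wise dot-product support below CC 6.1
    #endif
    }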
@@ -3485,13 +3988,13 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
 // rope == RoPE == rotary positional embedding
 static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
                                 const float p_delta, const int p_delta_rows, const float theta_scale) {
-    const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
     if (col >= ncols) {
         return;
     }
 
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
     const int i = row*ncols + col;
 
     const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
@@ -3505,6 +4008,28 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }
 
+static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
+                                     const float p_delta, const int p_delta_rows, const float theta_scale) {
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i = row*ncols + col/2;
+
+    const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + ncols/2];
+
+    dst[i + 0]       = x0*cos_theta - x1*sin_theta;
+    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+}
+
 static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;
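Note: `rope_neox_f32` sits next to the existing `rope_f32` but pairs elements differently: the original kernel rotates adjacent pairs `(x[i], x[i+1])`, while the NeoX variant rotates an element with its counterpart half a row away, `(x[i], x[i + ncols/2])`, as used by GPT-NeoX-style models. A host-side reference of the two pairings (plain C++; a real implementation varies `theta` per pair, which is omitted here for brevity):

    #include <cmath>
    #include <vector>

    // Rotate one (x0, x1) pair by theta.
    static void rotate(float & x0, float & x1, const float theta) {
        const float c = cosf(theta), s = sinf(theta);
        const float r0 = x0*c - x1*s;
        const float r1 = x0*s + x1*c;
        x0 = r0; x1 = r1;
    }

    // Normal RoPE pairs neighbours; NeoX RoPE pairs the two halves of the row.
    static void rope_row(std::vector<float> & x, const float theta, const bool neox) {
        const size_t n = x.size();
        for (size_t i = 0; i < n/2; ++i) {
            if (neox) { rotate(x[i],   x[i + n/2], theta); }
            else      { rotate(x[2*i], x[2*i + 1], theta); }
        }
    }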
@@ -3539,9 +4064,32 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
 }
 
-static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
+                                 const int n_heads_log2_floor, const float m0, const float m1) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (col >= ncols) {
+        return;
+    }
+
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i = row*ncols + col;
+
+    const int k = row/k_rows;
+
+    float m_k;
+    if (k < n_heads_log2_floor) {
+        m_k = powf(m0, k + 1);
+    } else {
+        m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+    }
+
+    dst[i] = col * m_k + x[i];
+}
+
+static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+    const int col = blockDim.y*blockIdx.y + threadIdx.y;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (col >= ncols) {
         return;
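Note: the new `alibi_f32` kernel adds the per-head linear bias `col * m_k` of ALiBi to the attention scores, with `m0^(k+1)` slopes for the first `n_heads_log2_floor` heads and odd powers of `m1` for the rest. A quick host-side check of that slope schedule (the `m0`/`m1` formulas mirror ggml's CPU alibi implementation and are an assumption here, as the diff only shows the device side):

    #include <cmath>
    #include <cstdio>

    int main() {
        const int n_heads = 8;
        const int n_heads_log2_floor = 1 << (int) floorf(log2f((float) n_heads));
        const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
        const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);

        for (int k = 0; k < n_heads; ++k) {
            const float m_k = k < n_heads_log2_floor
                ? powf(m0, k + 1)
                : powf(m1, 2*(k - n_heads_log2_floor) + 1);
            printf("head %d: slope %g\n", k, m_k); // slopes decay geometrically per head
        }
        return 0;
    }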
@@ -3554,24 +4102,29 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
 
 // the CUDA soft max implementation differs from the CPU implementation
 // instead of doubles floats are used
-// values are also not normalized to the maximum value by subtracting it in the exponential function
-// theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
 static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-    const int block_size = blockDim.x;
-    const int tid = threadIdx.x;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int block_size = blockDim.y;
+    const int tid = threadIdx.y;
 
-    float tmp = 0.0;
+    float max_val = -INFINITY;
 
-    for (int block_start = 0; block_start < ncols; block_start += block_size) {
-        const int col = block_start + tid;
+    for (int col = tid; col < ncols; col += block_size) {
+        const int i = row*ncols + col;
+        max_val = max(max_val, x[i]);
+    }
 
-        if (col >= ncols) {
-            break;
-        }
+    // find the max value in the block
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+    }
 
+    float tmp = 0.f;
+
+    for (int col = tid; col < ncols; col += block_size) {
         const int i = row*ncols + col;
-        const float val = expf(x[i]);
+        const float val = expf(x[i] - max_val);
         tmp += val;
         dst[i] = val;
     }
@@ -3582,15 +4135,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
     }
 
-    for (int block_start = 0; block_start < ncols; block_start += block_size) {
-        const int col = block_start + tid;
-
-        if (col >= ncols) {
-            break;
-        }
+    const float inv_tmp = 1.f / tmp;
 
+    for (int col = tid; col < ncols; col += block_size) {
         const int i = row*ncols + col;
-        dst[i] /= tmp;
+        dst[i] *= inv_tmp;
     }
 }
 
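Note: the rewritten `soft_max_f32` is the textbook numerically stable softmax: find the row maximum (finished with a warp-level `__shfl_xor_sync` reduction), subtract it inside `expf`, then scale by the reciprocal of the sum, replacing the earlier version whose own comments admitted it skipped the max subtraction. The same computation as a host reference:

    #include <cmath>
    #include <vector>

    // Numerically stable softmax: exp(x - max) / sum(exp(x - max)).
    static void softmax_row(std::vector<float> & x) {
        float max_val = -INFINITY;
        for (const float v : x) { max_val = fmaxf(max_val, v); }

        float sum = 0.0f;
        for (float & v : x) { v = expf(v - max_val); sum += v; }

        const float inv_sum = 1.0f / sum;
        for (float & v : x) { v *= inv_sum; }
    }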
@@ -3942,48 +4491,32 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-        const int mmq_x  =  64;
-        const int mmq_y  = 128;
-        const int nwarps =   4;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  = MMQ_X_Q4_0_AMPERE;
+        mmq_y  = MMQ_Y_Q4_0_AMPERE;
+        nwarps = NWARPS_Q4_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  = MMQ_X_Q4_0_PASCAL;
+        mmq_y  = MMQ_Y_Q4_0_PASCAL;
+        nwarps = NWARPS_Q4_0_PASCAL;
     } else {
-        const int mmq_x  = 64;
-        const int mmq_y  = 64;
-        const int nwarps =  8;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-                load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
     }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
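Note: each rewritten host launcher now only selects the tile constants for the detected compute capability and derives the grid from them: one thread block per `mmq_y` rows by `mmq_x` columns, with `need_check` picking the bounds-checked template instantiation only when `nrows_x` is not a multiple of the tile height. The grid arithmetic in isolation (a sketch mirroring the launcher above, not code from the diff):

    #include <utility>

    // Ceil-divide the output matrix into mmq_y x mmq_x tiles; each tile is one block.
    static std::pair<int, int> mmq_grid_dims(int nrows_x, int ncols_y, int mmq_y, int mmq_x) {
        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; // blocks along rows of x
        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; // blocks along cols of y
        return {block_num_x, block_num_y};
    }
    // Example: 4096 rows with mmq_y = 128 -> 32 blocks; 512 cols with mmq_x = 64 -> 8 blocks.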
@@ -3995,49 +4528,32 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q4_1_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_1_AMPERE;
+        nwarps = NWARPS_Q4_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_1_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_1_PASCAL;
+        nwarps = NWARPS_Q4_1_PASCAL;
     } else {
-
-
-        const int nwarps = 8;
-
-        const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-        const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-        const dim3 block_nums(block_num_x, block_num_y, 1);
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-                load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
 
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4049,48 +4565,32 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q5_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_0_AMPERE;
+        nwarps = NWARPS_Q5_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q5_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_0_PASCAL;
+        nwarps = NWARPS_Q5_0_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-                load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4102,48 +4602,32 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q5_1_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_1_AMPERE;
+        nwarps = NWARPS_Q5_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q5_1_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_1_PASCAL;
+        nwarps = NWARPS_Q5_1_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-                load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4155,48 +4639,32 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q8_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q8_0_AMPERE;
+        nwarps = NWARPS_Q8_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q8_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q8_0_PASCAL;
+        nwarps = NWARPS_Q8_0_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-                load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4208,48 +4676,32 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q2_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q2_K_AMPERE;
+        nwarps = NWARPS_Q2_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q2_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q2_K_PASCAL;
+        nwarps = NWARPS_Q2_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-                load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4261,48 +4713,32 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q3_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q3_K_AMPERE;
+        nwarps = NWARPS_Q3_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q3_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q3_K_PASCAL;
+        nwarps = NWARPS_Q3_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-                load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4314,48 +4750,32 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q4_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_K_AMPERE;
+        nwarps = NWARPS_Q4_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_K_PASCAL;
+        nwarps = NWARPS_Q4_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-                load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4367,48 +4787,32 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q5_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_K_AMPERE;
+        nwarps = NWARPS_Q5_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q5_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_K_PASCAL;
+        nwarps = NWARPS_Q5_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-                load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4420,48 +4824,32 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
 
+    int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_TURING) {
-
-
-
-
-
-
-
-        const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-        if (nrows_x % mmq_y == 0) {
-            const bool need_check = false;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        } else {
-            const bool need_check = true;
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        mmq_x  =  MMQ_X_Q6_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q6_K_AMPERE;
+        nwarps = NWARPS_Q6_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q6_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q6_K_PASCAL;
+        nwarps = NWARPS_Q6_K_PASCAL;
     } else {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-                load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-                <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-        }
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, nwarps, 1);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    } else {
+        const bool need_check = true;
+        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
+            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
 }
 
@@ -4511,13 +4899,21 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
 
 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                           const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(nrows % 2 == 0);
-    const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+    GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
+    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-    const dim3 block_nums(num_blocks_x, nrows, 1);
+    const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }
 
+static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+                               const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nrows, num_blocks_x, 1);
+    rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
+}
+
 static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(nrows % 4 == 0);
     const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
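The rope launches above (and the diag-mask/soft-max launches in the next hunk) transpose their grid: rows move to blockIdx.x and the per-row column work to threadIdx.y/blockIdx.y. A sketch of the grid math, assuming CUDA_ROPE_BLOCK_SIZE is 256 as defined earlier in this file:

    #include <cuda_runtime.h>

    static dim3 rope_grid_sketch(const int nrows, const int ncols) {
        const int cols_per_block = 2*256;  // 2*CUDA_ROPE_BLOCK_SIZE: each thread rotates a pair of columns
        const int num_blocks_x   = (ncols + cols_per_block - 1) / cols_per_block; // ceil(ncols / cols_per_block)
        return dim3(nrows, num_blocks_x, 1);  // one grid slot per tensor row, column tiles in .y
    }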
@@ -4526,16 +4922,25 @@ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, con
     rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
 }
 
+static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
+                           const int k_rows, const int n_heads_log2_floor, const float m0,
+                           const float m1, cudaStream_t stream) {
+    const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
+    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
+    const dim3 block_nums(num_blocks_x, nrows, 1);
+    alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
-    const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
+    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
-    const dim3 block_nums(block_num_x, nrows_x, 1);
+    const dim3 block_nums(nrows_x, block_num_x, 1);
     diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 }
 
 static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums(1, nrows_x, 1);
+    const dim3 block_dims(1, WARP_SIZE, 1);
+    const dim3 block_nums(nrows_x, 1, 1);
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
 }
 
@@ -4640,10 +5045,18 @@ void ggml_init_cublas() {
     static bool initialized = false;
 
     if (!initialized) {
+
+#ifdef __HIP_PLATFORM_AMD__
+        // Workaround for a rocBLAS bug when using multiple graphics cards:
+        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+        rocblas_initialize();
+        CUDA_CHECK(cudaDeviceSynchronize());
+#endif
+
         CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
-        fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
         for (int id = 0; id < g_device_count; ++id) {
             cudaDeviceProp prop;
             CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
@@ -5241,7 +5654,8 @@ inline void ggml_cuda_op_rope(
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
-    const bool is_glm = mode & 4;
+    const bool is_neox = mode & 2;
+    const bool is_glm  = mode & 4;
 
     // compute
     if (is_glm) {
@@ -5249,6 +5663,10 @@ inline void ggml_cuda_op_rope(
         const float id_p = min(p, n_ctx - 2.f);
         const float block_p = max(p - (n_ctx - 2.f), 0.f);
         rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+    } else if (is_neox) {
+        GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
+        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+        rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
     } else {
         const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
         rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
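For reference, the mode decoding used above: bit 1 selects the NeoX-style rotation, bit 2 the GLM variant, and bit 0 controls whether the position offset starts at n_past. A hypothetical helper making the flags explicit:

    static inline void rope_mode_flags(const int mode, bool * is_neox, bool * is_glm) {
        *is_neox = (mode & 2) != 0; // rope mode bit 1: NeoX interleaving
        *is_glm  = (mode & 4) != 0; // rope mode bit 2: GLM variant
    }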
@@ -5261,6 +5679,41 @@ inline void ggml_cuda_op_rope(
     (void) i1;
 }
 
+inline void ggml_cuda_op_alibi(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_head = ((int32_t *) dst->op_params)[1];
+    float max_bias;
+    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+
+    GGML_ASSERT(ne01 + n_past == ne00);
+    GGML_ASSERT(n_head == ne02);
+
+    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+
+    // compute
+    alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
+
+    (void) src1;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
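The m0/m1 values computed above are the two geometric bases from the ALiBi scheme: the first 2^floor(log2(n_head)) heads take slopes m0^(h+1), and the remaining heads interpolate with odd powers of m1. Worked example: n_head = 12 and max_bias = 8.0 give n_heads_log2_floor = 8, m0 = 2^(-8/8) = 0.5, m1 = 2^(-4/8) ≈ 0.7071. The per-head selection below is a sketch mirroring ggml's CPU implementation:

    #include <math.h>

    static float alibi_slope(const int h, const int n_heads_log2_floor, const float m0, const float m1) {
        return h < n_heads_log2_floor
            ? powf(m0, h + 1)                              // heads 0 .. n_heads_log2_floor-1
            : powf(m1, 2*(h - n_heads_log2_floor) + 1);    // remaining heads: odd powers of m1
    }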
@@ -5881,6 +6334,11 @@ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ten
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }
 
+void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+}
+
 void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
@@ -6000,7 +6458,7 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     return extra;
 }
 
-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
@@ -6009,14 +6467,19 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
-            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
+            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
         }
     }
     if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
+        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
     }
 
     tensor->backend = GGML_BACKEND_GPU;
+
+    if (scratch && no_alloc) {
+        return;
+    }
+
     struct ggml_tensor_extra_gpu * extra;
 
     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
@@ -6068,16 +6531,48 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     tensor->extra = extra;
 }
 
+void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
+    if (g_scratch_size == 0) {
+        return;
+    }
+    if (g_scratch_buffer == nullptr) {
+        CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
+    }
+
+    struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+
+    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
+        tensor->op == GGML_OP_VIEW;
+
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+        size_t view_offset = 0;
+        if (tensor->op == GGML_OP_VIEW) {
+            memcpy(&view_offset, tensor->op_params, sizeof(size_t));
+        }
+        extra->data_device[g_main_device] = src0_ddc + view_offset;
+    } else {
+        extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
+    }
+
+    tensor->extra = extra;
+}
+
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true, false);
+    ggml_cuda_assign_buffers_impl(tensor, true, false, false);
+}
+
+void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, true, false, true);
 }
 
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, false);
+    ggml_cuda_assign_buffers_impl(tensor, false, false, false);
 }
 
 void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, true);
+    ggml_cuda_assign_buffers_impl(tensor, false, true, false);
 }
 
 void ggml_cuda_set_main_device(int main_device) {
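Taken together, the new no_alloc flag and ggml_cuda_assign_scratch_offset enable a two-phase setup: tensors can be tagged as GPU-resident while a graph allocator measures memory, then each one is bound to its planned position in the scratch buffer. A sketch of the intended call order (the offset value is assumed to come from the caller's allocator; ggml headers are assumed included):

    void place_tensor_sketch(struct ggml_tensor * t, size_t planned_offset) {
        ggml_cuda_assign_buffers_no_alloc(t);               // phase 1: backend = GPU, no memory bound yet
        // ... measurement pass decides planned_offset ...
        ggml_cuda_assign_scratch_offset(t, planned_offset); // phase 2: point t into the scratch buffer
    }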
@@ -6216,6 +6711,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_rope;
             break;
+        case GGML_OP_ALIBI:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_alibi;
+            break;
         default:
             return false;
     }
@@ -6229,3 +6730,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     func(tensor->src[0], tensor->src[1], tensor);
     return true;
 }
+
+int ggml_cuda_get_device_count() {
+    int device_count;
+    CUDA_CHECK(cudaGetDeviceCount(&device_count));
+    return device_count;
+}
+
+void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+    cudaDeviceProp prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+    snprintf(description, description_size, "%s", prop.name);
+}
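A minimal caller for the two new device helpers, assuming the matching declarations are exported from ggml-cuda.h:

    #include <stdio.h>

    static void list_devices(void) {
        char desc[256];
        const int count = ggml_cuda_get_device_count();
        for (int id = 0; id < count; ++id) {
            ggml_cuda_get_device_description(id, desc, sizeof(desc)); // copies prop.name into desc
            printf("device %d: %s\n", id, desc);
        }
    }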