llama_cpp 0.5.1 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -3
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +32 -2
- data/ext/llama_cpp/src/ggml-alloc.c +6 -11
- data/ext/llama_cpp/src/ggml-cuda.cu +1108 -699
- data/ext/llama_cpp/src/ggml-metal.m +93 -24
- data/ext/llama_cpp/src/ggml-metal.metal +407 -174
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +75 -43
- data/ext/llama_cpp/src/ggml.h +42 -32
- data/ext/llama_cpp/src/k_quants.c +4 -1
- data/ext/llama_cpp/src/llama.cpp +1040 -201
- data/ext/llama_cpp/src/llama.h +13 -7
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
@@ -13,7 +13,7 @@
 #ifdef __HIP_PLATFORM_AMD__
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
-#endif
+#endif // __HIP_PLATFORM_AMD__
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
@@ -31,6 +31,9 @@
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
+#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
 #define cudaDeviceProp hipDeviceProp_t
 #define cudaDeviceSynchronize hipDeviceSynchronize
 #define cudaError_t hipError_t
@@ -61,26 +64,36 @@
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event,
+#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #else
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
-#endif
+#endif // defined(GGML_USE_HIPBLAS)

 #include "ggml-cuda.h"
 #include "ggml.h"

-#define MIN_CC_DP4A
-#
-#define
-#
+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define CC_TURING 700
+#define CC_OFFSET_AMD 1000000
+#define CC_RDNA2 CC_OFFSET_AMD + 1030

 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300

+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+defined(__gfx1150__) || defined(__gfx1151__)
+#define RDNA3
+#endif
+
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+#define RDNA2
+#endif
+
 #ifndef __has_builtin
 #define __has_builtin(x) 0
 #endif
@@ -132,7 +145,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 #endif
 return c;
 }
-#endif
+#endif // defined(GGML_USE_HIPBLAS)

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -144,8 +157,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 do { \
 cudaError_t err_ = (err); \
 if (err_ != cudaSuccess) { \
-
+int id; \
+cudaGetDevice(&id); \
+fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
 cudaGetErrorString(err_)); \
+fprintf(stderr, "current device: %d\n", id); \
 exit(1); \
 } \
 } while (0)
@@ -155,8 +171,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 do { \
 cublasStatus_t err_ = (err); \
 if (err_ != CUBLAS_STATUS_SUCCESS) { \
+int id; \
+cudaGetDevice(&id); \
 fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
 err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
+fprintf(stderr, "current device: %d\n", id); \
 exit(1); \
 } \
 } while (0)
@@ -165,12 +184,21 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 do { \
 cublasStatus_t err_ = (err); \
 if (err_ != CUBLAS_STATUS_SUCCESS) { \
+int id; \
+cudaGetDevice(&id); \
 fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
+fprintf(stderr, "current device: %d\n", id); \
 exit(1); \
 } \
 } while (0)
 #endif // CUDART_VERSION >= 11

+#if CUDART_VERSION >= 11100
+#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+#else
+#define GGML_CUDA_ASSUME(x)
+#endif // CUDART_VERSION >= 11100
+
 #ifdef GGML_CUDA_F16
 typedef half dfloat; // dequantize float
 typedef half2 dfloat2;
@@ -212,10 +240,13 @@ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__
 typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
-typedef void (*
-const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
-
-cudaStream_t &
+typedef void (*ggml_cuda_op_mul_mat_t)(
+const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+const int64_t src1_padded_row_size, const cudaStream_t & stream);
+typedef void (*ggml_cuda_op_flatten_t)(
+const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream);

 // QK = number of values after dequantization
 // QR = QK / number of values before dequantization
@@ -396,11 +427,33 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif

+#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
+#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
+#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
+
+#define MUL_MAT_SRC1_COL_STRIDE 128
+
+#define MAX_STREAMS 8
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+
 struct ggml_tensor_extra_gpu {
 void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
 };

+// this is faster on Windows
+// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
+inline cudaError_t ggml_cuda_set_device(const int device) {
+int current_device;
+CUDA_CHECK(cudaGetDevice(&current_device));
+
+if (device == current_device) {
+return cudaSuccess;
+}
+
+return cudaSetDevice(device);
+}
+
 static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
@@ -413,8 +466,6 @@ static size_t g_scratch_offset = 0;

 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
 const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -2107,10 +2158,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);

 const int kbx = k / QI4_0;
 const int kqsx = k % QI4_0;
@@ -2201,10 +2252,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);

 const int kbx = k / QI4_1;
 const int kqsx = k % QI4_1;
@@ -2293,10 +2344,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);

 const int kbx = k / QI5_0;
 const int kqsx = k % QI5_0;
@@ -2407,10 +2458,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);

 const int kbx = k / QI5_1;
 const int kqsx = k % QI5_1;
@@ -2513,10 +2564,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);

 const int kbx = k / QI8_0;
 const int kqsx = k % QI8_0;
@@ -2604,10 +2655,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);

 const int kbx = k / QI2_K;
 const int kqsx = k % QI2_K;
@@ -2725,10 +2776,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);

 const int kbx = k / QI3_K;
 const int kqsx = k % QI3_K;
@@ -2943,10 +2994,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);

 const int kbx = k / QI4_K; // == 0 if QK_K == 256
 const int kqsx = k % QI4_K; // == k if QK_K == 256
@@ -3124,10 +3175,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);

 const int kbx = k / QI5_K; // == 0 if QK_K == 256
 const int kqsx = k % QI5_K; // == k if QK_K == 256
@@ -3253,10 +3304,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
 int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+GGML_CUDA_ASSUME(i_offset >= 0);
+GGML_CUDA_ASSUME(i_offset < nwarps);
+GGML_CUDA_ASSUME(k >= 0);
+GGML_CUDA_ASSUME(k < WARP_SIZE);

 const int kbx = k / QI6_K; // == 0 if QK_K == 256
 const int kqsx = k % QI6_K; // == k if QK_K == 256
@@ -3444,6 +3495,12 @@ static __device__ __forceinline__ void mul_mat_q(
 }
 }

+#define MMQ_X_Q4_0_RDNA2 64
+#define MMQ_Y_Q4_0_RDNA2 128
+#define NWARPS_Q4_0_RDNA2 8
+#define MMQ_X_Q4_0_RDNA1 64
+#define MMQ_Y_Q4_0_RDNA1 64
+#define NWARPS_Q4_0_RDNA1 8
 #define MMQ_X_Q4_0_AMPERE 64
 #define MMQ_Y_Q4_0_AMPERE 128
 #define NWARPS_Q4_0_AMPERE 4
@@ -3451,11 +3508,32 @@ static __device__ __forceinline__ void mul_mat_q(
 #define MMQ_Y_Q4_0_PASCAL 64
 #define NWARPS_Q4_0_PASCAL 8

-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+__launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q4_0(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+const int mmq_x = MMQ_X_Q4_0_RDNA2;
+const int mmq_y = MMQ_Y_Q4_0_RDNA2;
+const int nwarps = NWARPS_Q4_0_RDNA2;
+#else
+const int mmq_x = MMQ_X_Q4_0_RDNA1;
+const int mmq_y = MMQ_Y_Q4_0_RDNA1;
+const int nwarps = NWARPS_Q4_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
 const int mmq_x = MMQ_X_Q4_0_AMPERE;
 const int mmq_y = MMQ_Y_Q4_0_AMPERE;
 const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3478,6 +3556,12 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q4_1_RDNA2 64
+#define MMQ_Y_Q4_1_RDNA2 128
+#define NWARPS_Q4_1_RDNA2 8
+#define MMQ_X_Q4_1_RDNA1 64
+#define MMQ_Y_Q4_1_RDNA1 64
+#define NWARPS_Q4_1_RDNA1 8
 #define MMQ_X_Q4_1_AMPERE 64
 #define MMQ_Y_Q4_1_AMPERE 128
 #define NWARPS_Q4_1_AMPERE 4
@@ -3486,14 +3570,33 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
 #define NWARPS_Q4_1_PASCAL 8

 template <bool need_check> static __global__ void
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+__launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
 __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
 mul_mat_q4_1(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+const int mmq_x = MMQ_X_Q4_1_RDNA2;
+const int mmq_y = MMQ_Y_Q4_1_RDNA2;
+const int nwarps = NWARPS_Q4_1_RDNA2;
+#else
+const int mmq_x = MMQ_X_Q4_1_RDNA1;
+const int mmq_y = MMQ_Y_Q4_1_RDNA1;
+const int nwarps = NWARPS_Q4_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
 const int mmq_x = MMQ_X_Q4_1_AMPERE;
 const int mmq_y = MMQ_Y_Q4_1_AMPERE;
 const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3516,6 +3619,12 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q5_0_RDNA2 64
+#define MMQ_Y_Q5_0_RDNA2 128
+#define NWARPS_Q5_0_RDNA2 8
+#define MMQ_X_Q5_0_RDNA1 64
+#define MMQ_Y_Q5_0_RDNA1 64
+#define NWARPS_Q5_0_RDNA1 8
 #define MMQ_X_Q5_0_AMPERE 128
 #define MMQ_Y_Q5_0_AMPERE 64
 #define NWARPS_Q5_0_AMPERE 4
@@ -3523,11 +3632,32 @@ template <bool need_check> static __global__ void
 #define MMQ_Y_Q5_0_PASCAL 64
 #define NWARPS_Q5_0_PASCAL 8

-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+__launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q5_0(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+const int mmq_x = MMQ_X_Q5_0_RDNA2;
+const int mmq_y = MMQ_Y_Q5_0_RDNA2;
+const int nwarps = NWARPS_Q5_0_RDNA2;
+#else
+const int mmq_x = MMQ_X_Q5_0_RDNA1;
+const int mmq_y = MMQ_Y_Q5_0_RDNA1;
+const int nwarps = NWARPS_Q5_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
 const int mmq_x = MMQ_X_Q5_0_AMPERE;
 const int mmq_y = MMQ_Y_Q5_0_AMPERE;
 const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3550,6 +3680,12 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q5_1_RDNA2 64
+#define MMQ_Y_Q5_1_RDNA2 128
+#define NWARPS_Q5_1_RDNA2 8
+#define MMQ_X_Q5_1_RDNA1 64
+#define MMQ_Y_Q5_1_RDNA1 64
+#define NWARPS_Q5_1_RDNA1 8
 #define MMQ_X_Q5_1_AMPERE 128
 #define MMQ_Y_Q5_1_AMPERE 64
 #define NWARPS_Q5_1_AMPERE 4
@@ -3557,11 +3693,32 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
 #define MMQ_Y_Q5_1_PASCAL 64
 #define NWARPS_Q5_1_PASCAL 8

-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+__launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q5_1(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+const int mmq_x = MMQ_X_Q5_1_RDNA2;
+const int mmq_y = MMQ_Y_Q5_1_RDNA2;
+const int nwarps = NWARPS_Q5_1_RDNA2;
+#else
+const int mmq_x = MMQ_X_Q5_1_RDNA1;
+const int mmq_y = MMQ_Y_Q5_1_RDNA1;
+const int nwarps = NWARPS_Q5_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
 const int mmq_x = MMQ_X_Q5_1_AMPERE;
 const int mmq_y = MMQ_Y_Q5_1_AMPERE;
 const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3584,6 +3741,12 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q8_0_RDNA2 64
+#define MMQ_Y_Q8_0_RDNA2 128
+#define NWARPS_Q8_0_RDNA2 8
+#define MMQ_X_Q8_0_RDNA1 64
+#define MMQ_Y_Q8_0_RDNA1 64
+#define NWARPS_Q8_0_RDNA1 8
 #define MMQ_X_Q8_0_AMPERE 128
 #define MMQ_Y_Q8_0_AMPERE 64
 #define NWARPS_Q8_0_AMPERE 4
@@ -3591,11 +3754,32 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
 #define MMQ_Y_Q8_0_PASCAL 64
 #define NWARPS_Q8_0_PASCAL 8

-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+__launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q8_0(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+const int mmq_x = MMQ_X_Q8_0_RDNA2;
+const int mmq_y = MMQ_Y_Q8_0_RDNA2;
+const int nwarps = NWARPS_Q8_0_RDNA2;
+#else
+const int mmq_x = MMQ_X_Q8_0_RDNA1;
+const int mmq_y = MMQ_Y_Q8_0_RDNA1;
+const int nwarps = NWARPS_Q8_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
 const int mmq_x = MMQ_X_Q8_0_AMPERE;
 const int mmq_y = MMQ_Y_Q8_0_AMPERE;
 const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3618,6 +3802,12 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q2_K_RDNA2 64
+#define MMQ_Y_Q2_K_RDNA2 128
+#define NWARPS_Q2_K_RDNA2 8
+#define MMQ_X_Q2_K_RDNA1 128
+#define MMQ_Y_Q2_K_RDNA1 32
+#define NWARPS_Q2_K_RDNA1 8
 #define MMQ_X_Q2_K_AMPERE 64
 #define MMQ_Y_Q2_K_AMPERE 128
 #define NWARPS_Q2_K_AMPERE 4
@@ -3625,11 +3815,32 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
 #define MMQ_Y_Q2_K_PASCAL 64
 #define NWARPS_Q2_K_PASCAL 8

-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+__launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q2_K(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+const int mmq_x = MMQ_X_Q2_K_RDNA2;
+const int mmq_y = MMQ_Y_Q2_K_RDNA2;
+const int nwarps = NWARPS_Q2_K_RDNA2;
+#else
+const int mmq_x = MMQ_X_Q2_K_RDNA1;
+const int mmq_y = MMQ_Y_Q2_K_RDNA1;
+const int nwarps = NWARPS_Q2_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
 const int mmq_x = MMQ_X_Q2_K_AMPERE;
 const int mmq_y = MMQ_Y_Q2_K_AMPERE;
 const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3652,6 +3863,12 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q3_K_RDNA2 128
+#define MMQ_Y_Q3_K_RDNA2 64
+#define NWARPS_Q3_K_RDNA2 8
+#define MMQ_X_Q3_K_RDNA1 32
+#define MMQ_Y_Q3_K_RDNA1 128
+#define NWARPS_Q3_K_RDNA1 8
 #define MMQ_X_Q3_K_AMPERE 128
 #define MMQ_Y_Q3_K_AMPERE 128
 #define NWARPS_Q3_K_AMPERE 4
@@ -3660,14 +3877,33 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
 #define NWARPS_Q3_K_PASCAL 8

 template <bool need_check> static __global__ void
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+__launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
 __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
 mul_mat_q3_K(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+const int mmq_x = MMQ_X_Q3_K_RDNA2;
+const int mmq_y = MMQ_Y_Q3_K_RDNA2;
+const int nwarps = NWARPS_Q3_K_RDNA2;
+#else
+const int mmq_x = MMQ_X_Q3_K_RDNA1;
+const int mmq_y = MMQ_Y_Q3_K_RDNA1;
+const int nwarps = NWARPS_Q3_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
 const int mmq_x = MMQ_X_Q3_K_AMPERE;
 const int mmq_y = MMQ_Y_Q3_K_AMPERE;
 const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3690,6 +3926,12 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q4_K_RDNA2 64
+#define MMQ_Y_Q4_K_RDNA2 128
+#define NWARPS_Q4_K_RDNA2 8
+#define MMQ_X_Q4_K_RDNA1 32
+#define MMQ_Y_Q4_K_RDNA1 64
+#define NWARPS_Q4_K_RDNA1 8
 #define MMQ_X_Q4_K_AMPERE 64
 #define MMQ_Y_Q4_K_AMPERE 128
 #define NWARPS_Q4_K_AMPERE 4
@@ -3698,14 +3940,33 @@ template <bool need_check> static __global__ void
 #define NWARPS_Q4_K_PASCAL 8

 template <bool need_check> static __global__ void
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+__launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
 __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
 mul_mat_q4_K(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+const int mmq_x = MMQ_X_Q4_K_RDNA2;
+const int mmq_y = MMQ_Y_Q4_K_RDNA2;
+const int nwarps = NWARPS_Q4_K_RDNA2;
+#else
+const int mmq_x = MMQ_X_Q4_K_RDNA1;
+const int mmq_y = MMQ_Y_Q4_K_RDNA1;
+const int nwarps = NWARPS_Q4_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
 const int mmq_x = MMQ_X_Q4_K_AMPERE;
 const int mmq_y = MMQ_Y_Q4_K_AMPERE;
 const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -3728,6 +3989,12 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q5_K_RDNA2 64
+#define MMQ_Y_Q5_K_RDNA2 128
+#define NWARPS_Q5_K_RDNA2 8
+#define MMQ_X_Q5_K_RDNA1 32
+#define MMQ_Y_Q5_K_RDNA1 64
+#define NWARPS_Q5_K_RDNA1 8
 #define MMQ_X_Q5_K_AMPERE 64
 #define MMQ_Y_Q5_K_AMPERE 128
 #define NWARPS_Q5_K_AMPERE 4
@@ -3735,11 +4002,32 @@ template <bool need_check> static __global__ void
 #define MMQ_Y_Q5_K_PASCAL 64
 #define NWARPS_Q5_K_PASCAL 8

-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+__launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q5_K(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+const int mmq_x = MMQ_X_Q5_K_RDNA2;
+const int mmq_y = MMQ_Y_Q5_K_RDNA2;
+const int nwarps = NWARPS_Q5_K_RDNA2;
+#else
+const int mmq_x = MMQ_X_Q5_K_RDNA1;
+const int mmq_y = MMQ_Y_Q5_K_RDNA1;
+const int nwarps = NWARPS_Q5_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
 const int mmq_x = MMQ_X_Q5_K_AMPERE;
 const int mmq_y = MMQ_Y_Q5_K_AMPERE;
 const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -3762,6 +4050,12 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q6_K_RDNA2 64
+#define MMQ_Y_Q6_K_RDNA2 128
+#define NWARPS_Q6_K_RDNA2 8
+#define MMQ_X_Q6_K_RDNA1 32
+#define MMQ_Y_Q6_K_RDNA1 64
+#define NWARPS_Q6_K_RDNA1 8
 #define MMQ_X_Q6_K_AMPERE 64
 #define MMQ_Y_Q6_K_AMPERE 64
 #define NWARPS_Q6_K_AMPERE 4
@@ -3770,14 +4064,33 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
 #define NWARPS_Q6_K_PASCAL 8

 template <bool need_check> static __global__ void
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+__launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
 __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
 mul_mat_q6_K(
 const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+const int mmq_x = MMQ_X_Q6_K_RDNA2;
+const int mmq_y = MMQ_Y_Q6_K_RDNA2;
+const int nwarps = NWARPS_Q6_K_RDNA2;
+#else
+const int mmq_x = MMQ_X_Q6_K_RDNA1;
+const int mmq_y = MMQ_Y_Q6_K_RDNA1;
+const int nwarps = NWARPS_Q6_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
 const int mmq_x = MMQ_X_Q6_K_AMPERE;
 const int mmq_y = MMQ_Y_Q6_K_AMPERE;
 const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4086,7 +4399,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
 dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }

-static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
+const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
 const int col = blockDim.x*blockIdx.x + threadIdx.x;
 const int half_n_dims = ncols/4;

@@ -4098,8 +4412,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
 const int i = row*ncols + col;

 const float col_theta_scale = powf(theta_scale, col);
+const float p = p0 + p_delta*(row/p_delta_rows);

-const float theta = p*col_theta_scale;
+const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
 const float sin_theta = sinf(theta);
 const float cos_theta = cosf(theta);

@@ -4109,7 +4424,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
 dst[i + 0] = x0*cos_theta - x1*sin_theta;
 dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

-const float block_theta =
+const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
 const float sin_block_theta = sinf(block_theta);
 const float cos_block_theta = cosf(block_theta);

@@ -4558,7 +4873,15 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
 const int compute_capability = g_compute_capabilities[id];

 int mmq_x, mmq_y, nwarps;
-if (compute_capability >=
+if (compute_capability >= CC_RDNA2) {
+mmq_x = MMQ_X_Q4_0_RDNA2;
+mmq_y = MMQ_Y_Q4_0_RDNA2;
+nwarps = NWARPS_Q4_0_RDNA2;
+} else if (compute_capability >= CC_OFFSET_AMD) {
+mmq_x = MMQ_X_Q4_0_RDNA1;
+mmq_y = MMQ_Y_Q4_0_RDNA1;
+nwarps = NWARPS_Q4_0_RDNA1;
+} else if (compute_capability >= CC_TURING) {
 mmq_x = MMQ_X_Q4_0_AMPERE;
 mmq_y = MMQ_Y_Q4_0_AMPERE;
 nwarps = NWARPS_Q4_0_AMPERE;
@@ -4595,7 +4918,15 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
 const int compute_capability = g_compute_capabilities[id];

 int mmq_x, mmq_y, nwarps;
-if (compute_capability >=
+if (compute_capability >= CC_RDNA2) {
+mmq_x = MMQ_X_Q4_1_RDNA2;
+mmq_y = MMQ_Y_Q4_1_RDNA2;
+nwarps = NWARPS_Q4_1_RDNA2;
+} else if (compute_capability >= CC_OFFSET_AMD) {
+mmq_x = MMQ_X_Q4_1_RDNA1;
+mmq_y = MMQ_Y_Q4_1_RDNA1;
+nwarps = NWARPS_Q4_1_RDNA1;
+} else if (compute_capability >= CC_TURING) {
 mmq_x = MMQ_X_Q4_1_AMPERE;
 mmq_y = MMQ_Y_Q4_1_AMPERE;
 nwarps = NWARPS_Q4_1_AMPERE;
@@ -4632,7 +4963,15 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
 const int compute_capability = g_compute_capabilities[id];

 int mmq_x, mmq_y, nwarps;
-if (compute_capability >=
+if (compute_capability >= CC_RDNA2) {
+mmq_x = MMQ_X_Q5_0_RDNA2;
+mmq_y = MMQ_Y_Q5_0_RDNA2;
+nwarps = NWARPS_Q5_0_RDNA2;
+} else if (compute_capability >= CC_OFFSET_AMD) {
+mmq_x = MMQ_X_Q5_0_RDNA1;
+mmq_y = MMQ_Y_Q5_0_RDNA1;
+nwarps = NWARPS_Q5_0_RDNA1;
+} else if (compute_capability >= CC_TURING) {
 mmq_x = MMQ_X_Q5_0_AMPERE;
 mmq_y = MMQ_Y_Q5_0_AMPERE;
 nwarps = NWARPS_Q5_0_AMPERE;
@@ -4669,7 +5008,15 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
 const int compute_capability = g_compute_capabilities[id];

 int mmq_x, mmq_y, nwarps;
-if (compute_capability >=
+if (compute_capability >= CC_RDNA2) {
+mmq_x = MMQ_X_Q5_1_RDNA2;
+mmq_y = MMQ_Y_Q5_1_RDNA2;
+nwarps = NWARPS_Q5_1_RDNA2;
+} else if (compute_capability >= CC_OFFSET_AMD) {
+mmq_x = MMQ_X_Q5_1_RDNA1;
+mmq_y = MMQ_Y_Q5_1_RDNA1;
+nwarps = NWARPS_Q5_1_RDNA1;
+} else if (compute_capability >= CC_TURING) {
 mmq_x = MMQ_X_Q5_1_AMPERE;
 mmq_y = MMQ_Y_Q5_1_AMPERE;
 nwarps = NWARPS_Q5_1_AMPERE;
@@ -4706,7 +5053,15 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
 const int compute_capability = g_compute_capabilities[id];

 int mmq_x, mmq_y, nwarps;
-if (compute_capability >=
+if (compute_capability >= CC_RDNA2) {
+mmq_x = MMQ_X_Q8_0_RDNA2;
+mmq_y = MMQ_Y_Q8_0_RDNA2;
+nwarps = NWARPS_Q8_0_RDNA2;
+} else if (compute_capability >= CC_OFFSET_AMD) {
+mmq_x = MMQ_X_Q8_0_RDNA1;
+mmq_y = MMQ_Y_Q8_0_RDNA1;
+nwarps = NWARPS_Q8_0_RDNA1;
+} else if (compute_capability >= CC_TURING) {
 mmq_x = MMQ_X_Q8_0_AMPERE;
 mmq_y = MMQ_Y_Q8_0_AMPERE;
 nwarps = NWARPS_Q8_0_AMPERE;
@@ -4743,7 +5098,15 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
 const int compute_capability = g_compute_capabilities[id];

 int mmq_x, mmq_y, nwarps;
-if (compute_capability >=
+if (compute_capability >= CC_RDNA2) {
+mmq_x = MMQ_X_Q2_K_RDNA2;
+mmq_y = MMQ_Y_Q2_K_RDNA2;
+nwarps = NWARPS_Q2_K_RDNA2;
+} else if (compute_capability >= CC_OFFSET_AMD) {
+mmq_x = MMQ_X_Q2_K_RDNA1;
+mmq_y = MMQ_Y_Q2_K_RDNA1;
+nwarps = NWARPS_Q2_K_RDNA1;
+} else if (compute_capability >= CC_TURING) {
 mmq_x = MMQ_X_Q2_K_AMPERE;
 mmq_y = MMQ_Y_Q2_K_AMPERE;
 nwarps = NWARPS_Q2_K_AMPERE;
@@ -4782,7 +5145,15 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
 const int compute_capability = g_compute_capabilities[id];

 int mmq_x, mmq_y, nwarps;
-if (compute_capability >=
+if (compute_capability >= CC_RDNA2) {
+mmq_x = MMQ_X_Q3_K_RDNA2;
+mmq_y = MMQ_Y_Q3_K_RDNA2;
+nwarps = NWARPS_Q3_K_RDNA2;
+} else if (compute_capability >= CC_OFFSET_AMD) {
+mmq_x = MMQ_X_Q3_K_RDNA1;
+mmq_y = MMQ_Y_Q3_K_RDNA1;
+nwarps = NWARPS_Q3_K_RDNA1;
+} else if (compute_capability >= CC_TURING) {
 mmq_x = MMQ_X_Q3_K_AMPERE;
 mmq_y = MMQ_Y_Q3_K_AMPERE;
 nwarps = NWARPS_Q3_K_AMPERE;
@@ -4820,7 +5191,15 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
 const int compute_capability = g_compute_capabilities[id];

 int mmq_x, mmq_y, nwarps;
-if (compute_capability >=
+if (compute_capability >= CC_RDNA2) {
+mmq_x = MMQ_X_Q4_K_RDNA2;
+mmq_y = MMQ_Y_Q4_K_RDNA2;
+nwarps = NWARPS_Q4_K_RDNA2;
+} else if (compute_capability >= CC_OFFSET_AMD) {
+mmq_x = MMQ_X_Q4_K_RDNA1;
+mmq_y = MMQ_Y_Q4_K_RDNA1;
+nwarps = NWARPS_Q4_K_RDNA1;
+} else if (compute_capability >= CC_TURING) {
 mmq_x = MMQ_X_Q4_K_AMPERE;
 mmq_y = MMQ_Y_Q4_K_AMPERE;
 nwarps = NWARPS_Q4_K_AMPERE;
@@ -4857,7 +5236,15 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
 const int compute_capability = g_compute_capabilities[id];

 int mmq_x, mmq_y, nwarps;
-if (compute_capability >=
+if (compute_capability >= CC_RDNA2) {
+mmq_x = MMQ_X_Q5_K_RDNA2;
+mmq_y = MMQ_Y_Q5_K_RDNA2;
+nwarps = NWARPS_Q5_K_RDNA2;
+} else if (compute_capability >= CC_OFFSET_AMD) {
+mmq_x = MMQ_X_Q5_K_RDNA1;
+mmq_y = MMQ_Y_Q5_K_RDNA1;
+nwarps = NWARPS_Q5_K_RDNA1;
+} else if (compute_capability >= CC_TURING) {
 mmq_x = MMQ_X_Q5_K_AMPERE;
 mmq_y = MMQ_Y_Q5_K_AMPERE;
 nwarps = NWARPS_Q5_K_AMPERE;
@@ -4894,7 +5281,15 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
 const int compute_capability = g_compute_capabilities[id];

 int mmq_x, mmq_y, nwarps;
-if (compute_capability >=
+if (compute_capability >= CC_RDNA2) {
+mmq_x = MMQ_X_Q6_K_RDNA2;
+mmq_y = MMQ_Y_Q6_K_RDNA2;
+nwarps = NWARPS_Q6_K_RDNA2;
+} else if (compute_capability >= CC_OFFSET_AMD) {
+mmq_x = MMQ_X_Q6_K_RDNA1;
+mmq_y = MMQ_Y_Q6_K_RDNA1;
+nwarps = NWARPS_Q6_K_RDNA1;
+} else if (compute_capability >= CC_TURING) {
 mmq_x = MMQ_X_Q6_K_AMPERE;
 mmq_y = MMQ_Y_Q6_K_AMPERE;
 nwarps = NWARPS_Q6_K_AMPERE;
@@ -4984,12 +5379,13 @@ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, co
 rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }

-static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float
-
-
-const
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
+GGML_ASSERT(ncols % 4 == 0);
+const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
+const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
 const dim3 block_nums(num_blocks_x, nrows, 1);
-rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols,
+rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
 }

 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5127,25 +5523,30 @@ void ggml_init_cublas() {
 GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
 int64_t total_vram = 0;
 fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-for (
+for (int64_t id = 0; id < g_device_count; ++id) {
 cudaDeviceProp prop;
 CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-fprintf(stderr, " Device %
+fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);

 g_tensor_split[id] = total_vram;
 total_vram += prop.totalGlobalMem;
-
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
+#else
 g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 }
-for (
+for (int64_t id = 0; id < g_device_count; ++id) {
 g_tensor_split[id] /= total_vram;
 }

-for (
-CUDA_CHECK(
+for (int64_t id = 0; id < g_device_count; ++id) {
+CUDA_CHECK(ggml_cuda_set_device(id));

-// create
-
+// create cuda streams
+for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
+}

 // create cublas handle
 CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
@@ -5214,7 +5615,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
 if (src->backend == GGML_BACKEND_CPU) {
 kind = cudaMemcpyHostToDevice;
 src_ptr = (char *) src->data;
-} else if (src->backend == GGML_BACKEND_GPU) {
+} else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
+GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
 kind = cudaMemcpyDeviceToDevice;
 struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
 int id;
@@ -5253,236 +5655,205 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
|
|
5253
5655
|
}
|
5254
5656
|
|
5255
5657
|
inline void ggml_cuda_op_add(
|
5256
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5257
|
-
float *
|
5258
|
-
cudaStream_t & cudaStream_main){
|
5259
|
-
|
5260
|
-
GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
|
5261
|
-
GGML_ASSERT(src1_ddf_i != nullptr);
|
5262
|
-
GGML_ASSERT(dst_ddf_i != nullptr);
|
5658
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5659
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5263
5660
|
|
5264
|
-
|
5265
|
-
const int64_t i01_diff = i01_high - i01_low;
|
5661
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
5266
5662
|
|
5267
5663
|
const int64_t ne10 = src1->ne[0];
|
5268
5664
|
const int64_t ne11 = src1->ne[1];
|
5269
5665
|
|
5270
|
-
// compute
|
5271
5666
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
5272
|
-
add_f32_cuda(
|
5667
|
+
add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
|
5273
5668
|
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
5274
|
-
add_f16_f32_f16_cuda((half *)
|
5669
|
+
add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
|
5275
5670
|
} else {
|
5276
5671
|
GGML_ASSERT(false);
|
5277
5672
|
}
|
5278
5673
|
|
5279
5674
|
(void) src1;
|
5280
5675
|
(void) dst;
|
5281
|
-
(void) src0_ddq_i;
|
5282
|
-
(void) i02;
|
5283
|
-
(void) i1;
|
5284
5676
|
}
|
5285
5677
|
|
5286
5678
|
inline void ggml_cuda_op_mul(
|
5287
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5288
|
-
float *
|
5289
|
-
cudaStream_t & cudaStream_main){
|
5679
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5680
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5290
5681
|
|
5291
|
-
GGML_ASSERT(
|
5292
|
-
GGML_ASSERT(
|
5293
|
-
GGML_ASSERT(
|
5294
|
-
|
5295
|
-
const int64_t ne00 = src0->ne[0];
|
5296
|
-
const int64_t i01_diff = i01_high - i01_low;
|
5682
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
5683
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
5684
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5297
5685
|
|
5298
5686
|
const int64_t ne10 = src1->ne[0];
|
5299
5687
|
const int64_t ne11 = src1->ne[1];
|
5300
5688
|
|
5301
|
-
mul_f32_cuda(
|
5689
|
+
mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
|
5302
5690
|
|
5303
5691
|
(void) dst;
|
5304
|
-
(void) src0_ddq_i;
|
5305
|
-
(void) i02;
|
5306
|
-
(void) i1;
|
5307
5692
|
}
 
 inline void ggml_cuda_op_gelu(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    float *
-    cudaStream_t & cudaStream_main){
-
-    GGML_ASSERT(src0_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i != nullptr);
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
-
-
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-
-    gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+    gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
     (void) src1;
     (void) dst;
-    (void)
-    (void) src1_ddf_i;
-    (void) i02;
-    (void) i1;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_silu(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    float *
-    cudaStream_t & cudaStream_main){
-
-    GGML_ASSERT(src0_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i != nullptr);
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
-
-
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-
-    silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+    silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
     (void) src1;
     (void) dst;
-    (void)
-    (void) src1_ddf_i;
-    (void) i02;
-    (void) i1;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_norm(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    float *
-    cudaStream_t & cudaStream_main){
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
     const int64_t ne00 = src0->ne[0];
-    const int64_t
+    const int64_t nrows = ggml_nrows(src0);
 
-
-    norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
 
     (void) src1;
     (void) dst;
-    (void)
-    (void) src1_ddf_i;
-    (void) i02;
-    (void) i1;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_rms_norm(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    float *
-    cudaStream_t & cudaStream_main){
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
     const int64_t ne00 = src0->ne[0];
-    const int64_t
+    const int64_t nrows = ggml_nrows(src0);
 
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
-
-    rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
+    rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
 
     (void) src1;
     (void) dst;
-    (void)
-    (void) src1_ddf_i;
-    (void) i02;
-    (void) i1;
+    (void) src1_dd;
 }
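
For context: the refactored ops above drop the per-slice arguments (i01_low/i01_high, i02) and instead launch once over the whole tensor, sizing the grid from ggml_nelements(src0) or ggml_nrows(src0). As a rough, self-contained sketch of that launch pattern only — k_gelu_example and GELU_BLOCK_SIZE_EXAMPLE are invented names for this illustration, not the library's actual kernel:

// illustrative only: a flattened element-wise kernel over k = ggml_nelements(src0)
#define GELU_BLOCK_SIZE_EXAMPLE 256  // hypothetical block size for this sketch

static __global__ void k_gelu_example(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    const float v = x[i];
    // tanh-based GELU approximation
    dst[i] = 0.5f*v*(1.0f + tanhf(0.79788456f*(v + 0.044715f*v*v*v)));
}

static void gelu_f32_example_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + GELU_BLOCK_SIZE_EXAMPLE - 1) / GELU_BLOCK_SIZE_EXAMPLE;
    k_gelu_example<<<num_blocks, GELU_BLOCK_SIZE_EXAMPLE, 0, stream>>>(x, dst, k);
}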
 
 inline void ggml_cuda_op_mul_mat_q(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
-
-    cudaStream_t &
-
-    GGML_ASSERT(src0_ddq_i != nullptr);
-    GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i != nullptr);
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
     const int64_t ne00 = src0->ne[0];
 
     const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
     GGML_ASSERT(ne10 % QK8_1 == 0);
 
     const int64_t ne0 = dst->ne[0];
 
-    const int64_t
+    const int64_t row_diff = row_high - row_low;
 
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
 
     // the main device has a larger memory buffer to hold the results from all GPUs
     // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
-    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 :
-
-    const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
-        ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
-    size_t as;
-    void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
-    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
+    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            ggml_mul_mat_q4_0_q8_1_cuda(
+            ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q4_1:
-            ggml_mul_mat_q4_1_q8_1_cuda(
+            ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q5_0:
-            ggml_mul_mat_q5_0_q8_1_cuda(
+            ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q5_1:
-            ggml_mul_mat_q5_1_q8_1_cuda(
+            ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q8_0:
-            ggml_mul_mat_q8_0_q8_1_cuda(
+            ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q2_K:
-            ggml_mul_mat_q2_K_q8_1_cuda(
+            ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
         case GGML_TYPE_Q3_K:
-            ggml_mul_mat_q3_K_q8_1_cuda(
+            ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
         case GGML_TYPE_Q4_K:
-            ggml_mul_mat_q4_K_q8_1_cuda(
+            ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
         case GGML_TYPE_Q5_K:
-            ggml_mul_mat_q5_K_q8_1_cuda(
+            ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
         case GGML_TYPE_Q6_K:
-            ggml_mul_mat_q6_K_q8_1_cuda(
+            ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
         default:
             GGML_ASSERT(false);
             break;
     }
 
-    ggml_cuda_pool_free(src1_q8_1, as);
-
     (void) src1;
     (void) dst;
-    (void)
-    (void) i02;
-    (void) i1;
+    (void) src1_ddf_i;
 }
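
ggml_cuda_op_mul_mat_q no longer quantizes src1 itself; it now receives src1 already quantized to q8_1, with each row padded to a multiple of MATRIX_ROW_PADDING, which is what the new src1_padded_row_size argument carries. The rounding is the same ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING expression visible in the removed code and again in ggml_cuda_op_mul_mat further down. A small hedged helper written only to illustrate the arithmetic (padded_row_size_example is not part of the file):

// illustration of the row-padding arithmetic used for the q8_1 buffers
static int64_t padded_row_size_example(int64_t ne10, int64_t matrix_row_padding) {
    if (ne10 % matrix_row_padding == 0) {
        return ne10;                                                   // already a multiple, no padding needed
    }
    return ne10 - ne10 % matrix_row_padding + matrix_row_padding;      // round up to the next multiple
}
// e.g. with a hypothetical padding of 512: ne10 == 4096 -> 4096, ne10 == 4100 -> 4608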
 
 static int64_t get_row_rounding(ggml_type type) {
-
-
-
-
-
+    int64_t min_compute_capability = INT_MAX;
+    int64_t max_compute_capability = INT_MIN;
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            if (min_compute_capability > g_compute_capabilities[id]) {
+                min_compute_capability = g_compute_capabilities[id];
+            }
+            if (max_compute_capability < g_compute_capabilities[id]) {
+                max_compute_capability = g_compute_capabilities[id];
+            }
         }
     }
 
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    switch(type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
+        case GGML_TYPE_F16:
+            return 1;
+        case GGML_TYPE_Q2_K:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
+        case GGML_TYPE_Q3_K:
+            return min_compute_capability < CC_RDNA2 ? 128 : 64;
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
+        default:
+            GGML_ASSERT(false);
+    }
+#else
     switch(type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
@@ -5503,170 +5874,147 @@ static int64_t get_row_rounding(ggml_type type) {
         default:
             GGML_ASSERT(false);
     }
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 }
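
get_row_rounding now folds both the minimum and maximum compute capability of the participating GPUs into the tile size, with a separate table for HIPBLAS/RDNA builds, and the value it returns is used to align each device's slice of src0 rows in ggml_cuda_op_mul_mat further down. A hedged sketch of that alignment only — split_row_bounds_example is invented for this illustration, the real logic is the row_low/row_high loop in ggml_cuda_op_mul_mat:

// illustrative only: derive [row_low, row_high) for one device from the tensor split fractions
static void split_row_bounds_example(int64_t ne01, const float * tensor_split, int device_count,
                                      int id, int64_t rounding, int64_t * row_low, int64_t * row_high) {
    *row_low  = id == 0 ? 0 : (int64_t)(ne01*tensor_split[id]);
    *row_low -= *row_low % rounding;                             // round down to a tile boundary

    *row_high  = id == device_count - 1 ? ne01 : (int64_t)(ne01*tensor_split[id + 1]);
    *row_high -= *row_high % rounding;                           // all but the last device end on a tile boundary
}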
|
5507
5879
|
|
5508
|
-
inline void
|
5509
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
|
5510
|
-
|
5511
|
-
cudaStream_t &
|
5512
|
-
|
5513
|
-
GGML_ASSERT(src0_ddq_i != nullptr);
|
5514
|
-
GGML_ASSERT(src1_ddf_i != nullptr);
|
5515
|
-
GGML_ASSERT(dst_ddf_i != nullptr);
|
5880
|
+
inline void ggml_cuda_op_mul_mat_vec_q(
|
5881
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
5882
|
+
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
5883
|
+
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
5516
5884
|
|
5517
5885
|
const int64_t ne00 = src0->ne[0];
|
5518
|
-
const int64_t
|
5886
|
+
const int64_t row_diff = row_high - row_low;
|
5519
5887
|
|
5520
|
-
|
5521
|
-
|
5522
|
-
|
5523
|
-
|
5524
|
-
|
5525
|
-
|
5888
|
+
switch (src0->type) {
|
5889
|
+
case GGML_TYPE_Q4_0:
|
5890
|
+
mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5891
|
+
break;
|
5892
|
+
case GGML_TYPE_Q4_1:
|
5893
|
+
mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5894
|
+
break;
|
5895
|
+
case GGML_TYPE_Q5_0:
|
5896
|
+
mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5897
|
+
break;
|
5898
|
+
case GGML_TYPE_Q5_1:
|
5899
|
+
mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5900
|
+
break;
|
5901
|
+
case GGML_TYPE_Q8_0:
|
5902
|
+
mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5903
|
+
break;
|
5904
|
+
case GGML_TYPE_Q2_K:
|
5905
|
+
mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5906
|
+
break;
|
5907
|
+
case GGML_TYPE_Q3_K:
|
5908
|
+
mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5909
|
+
break;
|
5910
|
+
case GGML_TYPE_Q4_K:
|
5911
|
+
mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5912
|
+
break;
|
5913
|
+
case GGML_TYPE_Q5_K:
|
5914
|
+
mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5915
|
+
break;
|
5916
|
+
case GGML_TYPE_Q6_K:
|
5917
|
+
mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5918
|
+
break;
|
5919
|
+
default:
|
5920
|
+
GGML_ASSERT(false);
|
5921
|
+
break;
|
5922
|
+
}
|
5526
5923
|
|
5527
|
-
|
5528
|
-
|
5529
|
-
|
5530
|
-
|
5531
|
-
|
5532
|
-
|
5533
|
-
#if QK_K == 256
|
5534
|
-
mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
|
5535
|
-
src0->type == GGML_TYPE_Q2_K ||
|
5536
|
-
src0->type == GGML_TYPE_Q3_K ||
|
5537
|
-
src0->type == GGML_TYPE_Q4_K ||
|
5538
|
-
src0->type == GGML_TYPE_Q5_K ||
|
5539
|
-
src0->type == GGML_TYPE_Q6_K;
|
5540
|
-
#endif // QK_K == 256
|
5541
|
-
|
5542
|
-
const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
|
5543
|
-
#endif
|
5924
|
+
(void) src1;
|
5925
|
+
(void) dst;
|
5926
|
+
(void) src1_ddf_i;
|
5927
|
+
(void) src1_ncols;
|
5928
|
+
(void) src1_padded_row_size;
|
5929
|
+
}
|
5544
5930
|
|
5545
|
-
|
5546
|
-
|
5547
|
-
|
5548
|
-
|
5549
|
-
void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
|
5550
|
-
quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
|
5551
|
-
|
5552
|
-
switch (src0->type) {
|
5553
|
-
case GGML_TYPE_Q4_0:
|
5554
|
-
mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5555
|
-
break;
|
5556
|
-
case GGML_TYPE_Q4_1:
|
5557
|
-
mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5558
|
-
break;
|
5559
|
-
case GGML_TYPE_Q5_0:
|
5560
|
-
mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5561
|
-
break;
|
5562
|
-
case GGML_TYPE_Q5_1:
|
5563
|
-
mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5564
|
-
break;
|
5565
|
-
case GGML_TYPE_Q8_0:
|
5566
|
-
mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5567
|
-
break;
|
5568
|
-
case GGML_TYPE_Q2_K:
|
5569
|
-
mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5570
|
-
break;
|
5571
|
-
case GGML_TYPE_Q3_K:
|
5572
|
-
mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5573
|
-
break;
|
5574
|
-
case GGML_TYPE_Q4_K:
|
5575
|
-
mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5576
|
-
break;
|
5577
|
-
case GGML_TYPE_Q5_K:
|
5578
|
-
mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5579
|
-
break;
|
5580
|
-
case GGML_TYPE_Q6_K:
|
5581
|
-
mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5582
|
-
break;
|
5583
|
-
default:
|
5584
|
-
GGML_ASSERT(false);
|
5585
|
-
break;
|
5586
|
-
}
|
5931
|
+
inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
5932
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
5933
|
+
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
5934
|
+
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
5587
5935
|
|
5588
|
-
|
5589
|
-
|
5590
|
-
|
5936
|
+
const int64_t ne00 = src0->ne[0];
|
5937
|
+
const int64_t row_diff = row_high - row_low;
|
5938
|
+
|
5939
|
+
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
5591
5940
|
#ifdef GGML_CUDA_F16
|
5592
|
-
|
5593
|
-
|
5594
|
-
|
5595
|
-
|
5596
|
-
|
5597
|
-
|
5598
|
-
|
5599
|
-
|
5600
|
-
|
5601
|
-
|
5602
|
-
|
5603
|
-
|
5604
|
-
|
5941
|
+
size_t ash;
|
5942
|
+
dfloat * src1_dfloat = nullptr; // dfloat == half
|
5943
|
+
|
5944
|
+
bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
5945
|
+
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
|
5946
|
+
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
5947
|
+
|
5948
|
+
if (src1_convert_f16) {
|
5949
|
+
src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
|
5950
|
+
ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
|
5951
|
+
ne00, 1, sizeof(float), 0, 0,
|
5952
|
+
ne00, 1, sizeof(half), 0, 0, stream);
|
5953
|
+
}
|
5605
5954
|
#else
|
5606
|
-
|
5955
|
+
const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
|
5607
5956
|
#endif // GGML_CUDA_F16
|
5608
5957
|
|
5609
|
-
|
5610
|
-
|
5611
|
-
|
5612
|
-
|
5613
|
-
|
5614
|
-
|
5615
|
-
|
5616
|
-
|
5617
|
-
|
5618
|
-
|
5619
|
-
|
5620
|
-
|
5621
|
-
|
5622
|
-
|
5623
|
-
|
5624
|
-
|
5625
|
-
|
5626
|
-
|
5627
|
-
|
5628
|
-
|
5629
|
-
|
5630
|
-
|
5631
|
-
|
5632
|
-
|
5633
|
-
|
5634
|
-
|
5635
|
-
|
5636
|
-
|
5637
|
-
|
5638
|
-
|
5639
|
-
|
5640
|
-
|
5641
|
-
|
5642
|
-
|
5643
|
-
|
5644
|
-
|
5645
|
-
|
5646
|
-
|
5958
|
+
switch (src0->type) {
|
5959
|
+
case GGML_TYPE_Q4_0:
|
5960
|
+
dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5961
|
+
break;
|
5962
|
+
case GGML_TYPE_Q4_1:
|
5963
|
+
dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5964
|
+
break;
|
5965
|
+
case GGML_TYPE_Q5_0:
|
5966
|
+
dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5967
|
+
break;
|
5968
|
+
case GGML_TYPE_Q5_1:
|
5969
|
+
dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5970
|
+
break;
|
5971
|
+
case GGML_TYPE_Q8_0:
|
5972
|
+
dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5973
|
+
break;
|
5974
|
+
case GGML_TYPE_Q2_K:
|
5975
|
+
dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5976
|
+
break;
|
5977
|
+
case GGML_TYPE_Q3_K:
|
5978
|
+
dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5979
|
+
break;
|
5980
|
+
case GGML_TYPE_Q4_K:
|
5981
|
+
dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5982
|
+
break;
|
5983
|
+
case GGML_TYPE_Q5_K:
|
5984
|
+
dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5985
|
+
break;
|
5986
|
+
case GGML_TYPE_Q6_K:
|
5987
|
+
dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5988
|
+
break;
|
5989
|
+
case GGML_TYPE_F16:
|
5990
|
+
convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5991
|
+
break;
|
5992
|
+
default:
|
5993
|
+
GGML_ASSERT(false);
|
5994
|
+
break;
|
5995
|
+
}
|
5647
5996
|
|
5648
5997
|
#ifdef GGML_CUDA_F16
|
5649
|
-
|
5650
|
-
|
5651
|
-
}
|
5652
|
-
#endif // GGML_CUDA_F16
|
5998
|
+
if (src1_convert_f16) {
|
5999
|
+
ggml_cuda_pool_free(src1_dfloat, ash);
|
5653
6000
|
}
|
6001
|
+
#endif // GGML_CUDA_F16
|
5654
6002
|
|
5655
6003
|
(void) src1;
|
5656
6004
|
(void) dst;
|
5657
|
-
(void)
|
5658
|
-
(void)
|
5659
|
-
(void)
|
6005
|
+
(void) src1_ddq_i;
|
6006
|
+
(void) src1_ncols;
|
6007
|
+
(void) src1_padded_row_size;
|
5660
6008
|
}
|
5661
6009
|
|
5662
6010
|
inline void ggml_cuda_op_mul_mat_cublas(
|
5663
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
|
5664
|
-
|
5665
|
-
cudaStream_t &
|
6011
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
6012
|
+
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
6013
|
+
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
5666
6014
|
|
5667
|
-
GGML_ASSERT(
|
6015
|
+
GGML_ASSERT(src0_dd_i != nullptr);
|
5668
6016
|
GGML_ASSERT(src1_ddf_i != nullptr);
|
5669
|
-
GGML_ASSERT(
|
6017
|
+
GGML_ASSERT(dst_dd_i != nullptr);
|
5670
6018
|
|
5671
6019
|
const float alpha = 1.0f;
|
5672
6020
|
const float beta = 0.0f;
|
@@ -5674,43 +6022,54 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|
5674
6022
|
const int64_t ne00 = src0->ne[0];
|
5675
6023
|
|
5676
6024
|
const int64_t ne10 = src1->ne[0];
|
5677
|
-
const int64_t ne11 = src1->ne[1];
|
5678
6025
|
|
5679
6026
|
const int64_t ne0 = dst->ne[0];
|
5680
|
-
const int64_t
|
6027
|
+
const int64_t row_diff = row_high - row_low;
|
6028
|
+
|
6029
|
+
float * src0_ddq_as_f32;
|
6030
|
+
size_t src0_as = 0;
|
6031
|
+
|
6032
|
+
if (src0->type != GGML_TYPE_F32) {
|
6033
|
+
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
6034
|
+
src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
|
6035
|
+
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
|
6036
|
+
}
|
6037
|
+
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
|
5681
6038
|
|
5682
6039
|
int id;
|
5683
6040
|
CUDA_CHECK(cudaGetDevice(&id));
|
5684
6041
|
|
5685
6042
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
5686
6043
|
// ldc == nrows of the matrix that cuBLAS writes into
|
5687
|
-
int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 :
|
6044
|
+
int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
|
5688
6045
|
|
5689
|
-
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id],
|
6046
|
+
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
|
5690
6047
|
CUBLAS_CHECK(
|
5691
6048
|
cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
5692
|
-
|
6049
|
+
row_diff, src1_ncols, ne10,
|
5693
6050
|
&alpha, src0_ddf_i, ne00,
|
5694
|
-
src1_ddf_i,
|
5695
|
-
&beta,
|
6051
|
+
src1_ddf_i, ne10,
|
6052
|
+
&beta, dst_dd_i, ldc));
|
6053
|
+
|
6054
|
+
if (src0_as > 0) {
|
6055
|
+
ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
|
6056
|
+
}
|
5696
6057
|
|
5697
6058
|
(void) dst;
|
5698
|
-
(void)
|
5699
|
-
(void)
|
5700
|
-
(void) i1;
|
6059
|
+
(void) src1_ddq_i;
|
6060
|
+
(void) src1_padded_row_size;
|
5701
6061
|
}
|
5702
6062
|
|
5703
6063
|
inline void ggml_cuda_op_rope(
|
5704
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5705
|
-
float *
|
5706
|
-
cudaStream_t & cudaStream_main){
|
6064
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6065
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5707
6066
|
|
5708
|
-
GGML_ASSERT(
|
5709
|
-
GGML_ASSERT(
|
6067
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6068
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5710
6069
|
|
5711
6070
|
const int64_t ne00 = src0->ne[0];
|
5712
6071
|
const int64_t ne01 = src0->ne[1];
|
5713
|
-
const int64_t
|
6072
|
+
const int64_t nrows = ggml_nrows(src0);
|
5714
6073
|
|
5715
6074
|
const int n_past = ((int32_t *) dst->op_params)[0];
|
5716
6075
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
@@ -5723,44 +6082,37 @@ inline void ggml_cuda_op_rope(
|
|
5723
6082
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
5724
6083
|
|
5725
6084
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
6085
|
+
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
5726
6086
|
|
5727
6087
|
const bool is_neox = mode & 2;
|
5728
6088
|
const bool is_glm = mode & 4;
|
5729
6089
|
|
5730
6090
|
// compute
|
5731
6091
|
if (is_glm) {
|
5732
|
-
|
5733
|
-
const float id_p = min(p, n_ctx - 2.f);
|
5734
|
-
const float block_p = max(p - (n_ctx - 2.f), 0.f);
|
5735
|
-
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
|
6092
|
+
rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream);
|
5736
6093
|
} else if (is_neox) {
|
5737
6094
|
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
|
5738
|
-
|
5739
|
-
rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
6095
|
+
rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
|
5740
6096
|
} else {
|
5741
|
-
|
5742
|
-
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
6097
|
+
rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
|
5743
6098
|
}
|
5744
6099
|
|
5745
6100
|
(void) src1;
|
5746
6101
|
(void) dst;
|
5747
|
-
(void)
|
5748
|
-
(void) src1_ddf_i;
|
5749
|
-
(void) i1;
|
6102
|
+
(void) src1_dd;
|
5750
6103
|
}
|
5751
6104
|
|
5752
6105
|
inline void ggml_cuda_op_alibi(
|
5753
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5754
|
-
float *
|
5755
|
-
cudaStream_t & cudaStream_main){
|
6106
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6107
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5756
6108
|
|
5757
|
-
GGML_ASSERT(
|
5758
|
-
GGML_ASSERT(
|
6109
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6110
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5759
6111
|
|
5760
6112
|
const int64_t ne00 = src0->ne[0];
|
5761
6113
|
const int64_t ne01 = src0->ne[1];
|
5762
6114
|
const int64_t ne02 = src0->ne[2];
|
5763
|
-
const int64_t
|
6115
|
+
const int64_t nrows = ggml_nrows(src0);
|
5764
6116
|
|
5765
6117
|
const int n_past = ((int32_t *) dst->op_params)[0];
|
5766
6118
|
const int n_head = ((int32_t *) dst->op_params)[1];
|
@@ -5775,334 +6127,393 @@ inline void ggml_cuda_op_alibi(
|
|
5775
6127
|
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
5776
6128
|
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
5777
6129
|
|
5778
|
-
|
5779
|
-
alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
|
6130
|
+
alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
|
5780
6131
|
|
5781
6132
|
(void) src1;
|
5782
|
-
(void)
|
5783
|
-
(void) src1_ddf_i;
|
5784
|
-
(void) i1;
|
6133
|
+
(void) src1_dd;
|
5785
6134
|
}
|
5786
6135
|
|
5787
6136
|
inline void ggml_cuda_op_diag_mask_inf(
|
5788
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5789
|
-
float *
|
5790
|
-
cudaStream_t & cudaStream_main){
|
6137
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6138
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5791
6139
|
|
5792
|
-
GGML_ASSERT(
|
5793
|
-
GGML_ASSERT(
|
6140
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6141
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5794
6142
|
|
5795
6143
|
const int64_t ne00 = src0->ne[0];
|
5796
6144
|
const int64_t ne01 = src0->ne[1];
|
5797
|
-
const
|
6145
|
+
const int nrows0 = ggml_nrows(src0);
|
5798
6146
|
|
5799
6147
|
const int n_past = ((int32_t *) dst->op_params)[0];
|
5800
6148
|
|
5801
|
-
|
5802
|
-
diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
|
6149
|
+
diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
|
5803
6150
|
|
5804
6151
|
(void) src1;
|
5805
6152
|
(void) dst;
|
5806
|
-
(void)
|
5807
|
-
(void) src1_ddf_i;
|
5808
|
-
(void) i02;
|
5809
|
-
(void) i1;
|
6153
|
+
(void) src1_dd;
|
5810
6154
|
}
 
 inline void ggml_cuda_op_soft_max(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    float *
-    cudaStream_t & cudaStream_main){
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
     const int64_t ne00 = src0->ne[0];
-    const int64_t
+    const int64_t nrows = ggml_nrows(src0);
 
-
-    soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+    soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
 
     (void) src1;
     (void) dst;
-    (void)
-    (void) src1_ddf_i;
-    (void) i02;
-    (void) i1;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_scale(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    float *
-    cudaStream_t & cudaStream_main){
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
     const float scale = ((float *) src1->data)[0];
 
-
-    const int64_t i01_diff = i01_high - i01_low;
-
-    // compute
-    scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
+    scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
-    (void)
-
-
-
+    (void) src1_dd;
+}
+
+static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
+    const int64_t nrows0 = ggml_nrows(src0);
+
+    const bool use_src1 = src1 != nullptr;
+    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
+
+    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+
+    const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
+    const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
+
+    const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
+
+    // dd = data device
+    float * src0_ddf = nullptr;
+    float * src1_ddf = nullptr;
+    float * dst_ddf = nullptr;
+
+    // as = actual size
+    size_t src0_asf = 0;
+    size_t src1_asf = 0;
+    size_t dst_asf = 0;
+
+    ggml_cuda_set_device(g_main_device);
+    const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    if (src0_on_device) {
+        src0_ddf = (float *) src0_extra->data_device[g_main_device];
+    } else {
+        src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
+        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
+    }
+
+    if (use_src1 && !src1_stays_on_host) {
+        if (src1_on_device) {
+            src1_ddf = (float *) src1_extra->data_device[g_main_device];
+        } else {
+            src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
+            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
+        }
+    }
+    if (dst_on_device) {
+        dst_ddf = (float *) dst_extra->data_device[g_main_device];
+    } else {
+        dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
+    }
+
+    // do the computation
+    op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
+    CUDA_CHECK(cudaGetLastError());
+
+    // copy dst to host if necessary
+    if (!dst_on_device) {
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
+    }
+
+    if (src0_asf > 0) {
+        ggml_cuda_pool_free(src0_ddf, src0_asf);
+    }
+    if (src1_asf > 0) {
+        ggml_cuda_pool_free(src1_ddf, src1_asf);
+    }
+    if (dst_asf > 0) {
+        ggml_cuda_pool_free(dst_ddf, dst_asf);
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+}
+
+void ggml_cuda_set_peer_access(const int n_tokens) {
+    static bool peer_access_enabled = false;
+
+    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
+
+    if (peer_access_enabled == enable_peer_access) {
+        return;
+    }
+
+#ifdef NDEBUG
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+
+        for (int id_other = 0; id_other < g_device_count; ++id_other) {
+            if (id == id_other) {
+                continue;
+            }
+            if (id != g_main_device && id_other != g_main_device) {
+                continue;
+            }
+
+            int can_access_peer;
+            CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
+            if (can_access_peer) {
+                if (enable_peer_access) {
+                    CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
+                } else {
+                    CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
+                }
            }
        }
    }
+#endif // NDEBUG
+
+    peer_access_enabled = enable_peer_access;
 }
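
ggml_cuda_set_peer_access above flips CUDA peer-to-peer access between the main device and the other devices on and off depending on the token batch size, and only in NDEBUG builds. For reference, the underlying runtime calls behave as in this standalone sketch; enable_p2p_example is an invented helper, not part of the gem:

#include <cuda_runtime.h>
#include <cstdio>

// illustrative only: enable peer access from `dev` to `peer` if the GPUs support it
static void enable_p2p_example(int dev, int peer) {
    int can_access = 0;
    if (cudaDeviceCanAccessPeer(&can_access, dev, peer) != cudaSuccess || !can_access) {
        return;  // no P2P path between these two devices
    }
    cudaSetDevice(dev);
    // flags must be 0; a second call returns cudaErrorPeerAccessAlreadyEnabled
    cudaError_t err = cudaDeviceEnablePeerAccess(peer, 0);
    if (err != cudaSuccess && err != cudaErrorPeerAccessAlreadyEnabled) {
        fprintf(stderr, "peer access %d -> %d failed: %s\n", dev, peer, cudaGetErrorString(err));
    }
}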
|
5858
6304
|
|
5859
|
-
static void
|
5860
|
-
|
6305
|
+
static void ggml_cuda_op_mul_mat(
|
6306
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
|
6307
|
+
const bool convert_src1_to_q8_1) {
|
6308
|
+
|
5861
6309
|
const int64_t ne00 = src0->ne[0];
|
5862
6310
|
const int64_t ne01 = src0->ne[1];
|
5863
6311
|
const int64_t ne02 = src0->ne[2];
|
5864
6312
|
const int64_t ne03 = src0->ne[3];
|
5865
6313
|
const int64_t nrows0 = ggml_nrows(src0);
|
5866
6314
|
|
5867
|
-
const
|
5868
|
-
const int64_t
|
5869
|
-
const int64_t
|
5870
|
-
const int64_t
|
5871
|
-
const int64_t
|
5872
|
-
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
|
6315
|
+
const int64_t ne10 = src1->ne[0];
|
6316
|
+
const int64_t ne11 = src1->ne[1];
|
6317
|
+
const int64_t ne12 = src1->ne[2];
|
6318
|
+
const int64_t ne13 = src1->ne[3];
|
6319
|
+
const int64_t nrows1 = ggml_nrows(src1);
|
5873
6320
|
|
5874
6321
|
GGML_ASSERT(ne03 == ne13);
|
5875
6322
|
|
5876
6323
|
const int64_t ne0 = dst->ne[0];
|
5877
6324
|
const int64_t ne1 = dst->ne[1];
|
5878
6325
|
|
5879
|
-
const int nb2
|
5880
|
-
const int nb3
|
6326
|
+
const int nb2 = dst->nb[2];
|
6327
|
+
const int nb3 = dst->nb[3];
|
6328
|
+
|
6329
|
+
ggml_cuda_set_peer_access(ne11);
|
5881
6330
|
|
5882
6331
|
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
|
5883
|
-
GGML_ASSERT(
|
6332
|
+
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
|
5884
6333
|
|
5885
|
-
|
5886
|
-
const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
|
5887
|
-
const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
|
5888
|
-
const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
|
5889
|
-
const int64_t src0_stride = ne00 * ne01 * stride_mod;
|
5890
|
-
const int64_t src1_stride = ne10 * ne11 * stride_mod;
|
5891
|
-
const int64_t dst_stride = ne0 * ne1 * stride_mod;
|
6334
|
+
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
|
5892
6335
|
|
5893
|
-
const int64_t
|
5894
|
-
const int64_t i03_max = flatten_rows ? 1 : ne03;
|
5895
|
-
const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
|
5896
|
-
const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
|
5897
|
-
GGML_ASSERT(!(flatten_rows && ne02 < ne12));
|
6336
|
+
const int64_t i02_divisor = ne12 / ne02;
|
5898
6337
|
|
5899
6338
|
const size_t src0_ts = ggml_type_size(src0->type);
|
5900
6339
|
const size_t src0_bs = ggml_blck_size(src0->type);
|
6340
|
+
const size_t q8_1_ts = sizeof(block_q8_1);
|
6341
|
+
const size_t q8_1_bs = QK8_1;
|
5901
6342
|
|
5902
|
-
struct ggml_tensor_extra_gpu * src0_extra =
|
5903
|
-
struct ggml_tensor_extra_gpu * src1_extra =
|
5904
|
-
struct ggml_tensor_extra_gpu *
|
6343
|
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6344
|
+
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
6345
|
+
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
5905
6346
|
|
5906
6347
|
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
5907
6348
|
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
5908
|
-
const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
|
5909
6349
|
|
5910
|
-
const bool src1_is_contiguous =
|
5911
|
-
const
|
5912
|
-
|
6350
|
+
const bool src1_is_contiguous = ggml_is_contiguous(src1);
|
6351
|
+
const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ?
|
6352
|
+
ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
|
5913
6353
|
|
5914
6354
|
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
6355
|
+
GGML_ASSERT(!(split && ne02 > 1));
|
6356
|
+
GGML_ASSERT(!(split && ne03 > 1));
|
5915
6357
|
GGML_ASSERT(!(split && ne02 < ne12));
|
5916
6358
|
|
5917
|
-
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
5918
|
-
|
5919
6359
|
// dd = data device
|
5920
|
-
char *
|
5921
|
-
float *
|
5922
|
-
|
5923
|
-
float *
|
5924
|
-
|
5925
|
-
// asq = actual size quantized, asf = actual size float
|
5926
|
-
size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
|
5927
|
-
size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
5928
|
-
size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
5929
|
-
size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
6360
|
+
char * src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
6361
|
+
float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
|
6362
|
+
char * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
|
6363
|
+
float * dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
5930
6364
|
|
5931
|
-
//
|
5932
|
-
|
5933
|
-
|
5934
|
-
|
5935
|
-
|
5936
|
-
}
|
6365
|
+
// as = actual size
|
6366
|
+
size_t src0_as[GGML_CUDA_MAX_DEVICES] = {0};
|
6367
|
+
size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
6368
|
+
size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
|
6369
|
+
size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0};
|
5937
6370
|
|
5938
|
-
|
5939
|
-
|
5940
|
-
continue;
|
5941
|
-
}
|
6371
|
+
int64_t row_low[GGML_CUDA_MAX_DEVICES];
|
6372
|
+
int64_t row_high[GGML_CUDA_MAX_DEVICES];
|
5942
6373
|
|
5943
|
-
|
5944
|
-
|
6374
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6375
|
+
// by default, use all rows
|
6376
|
+
row_low[id] = 0;
|
6377
|
+
row_high[id] = ne01;
|
5945
6378
|
|
5946
|
-
|
6379
|
+
// for multi GPU, get the row boundaries from tensor split
|
6380
|
+
// and round to mul_mat_q tile sizes
|
5947
6381
|
if (split) {
|
5948
6382
|
const int64_t rounding = get_row_rounding(src0->type);
|
5949
6383
|
|
5950
|
-
|
5951
|
-
|
6384
|
+
if (id != 0) {
|
6385
|
+
row_low[id] = ne01*g_tensor_split[id];
|
6386
|
+
row_low[id] -= row_low[id] % rounding;
|
6387
|
+
}
|
5952
6388
|
|
5953
|
-
if (id
|
5954
|
-
row_high
|
5955
|
-
|
5956
|
-
row_high = nrows0*g_tensor_split[id + 1];
|
5957
|
-
row_high -= row_high % rounding;
|
6389
|
+
if (id != g_device_count - 1) {
|
6390
|
+
row_high[id] = ne01*g_tensor_split[id + 1];
|
6391
|
+
row_high[id] -= row_high[id] % rounding;
|
5958
6392
|
}
|
5959
|
-
} else {
|
5960
|
-
row_low = 0;
|
5961
|
-
row_high = nrows0*i02_divisor;
|
5962
6393
|
}
|
5963
|
-
|
6394
|
+
}
|
6395
|
+
|
6396
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6397
|
+
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
5964
6398
|
continue;
|
5965
6399
|
}
|
5966
6400
|
|
5967
|
-
|
5968
|
-
|
5969
|
-
cudaSetDevice(id);
|
5970
|
-
cudaStream_t cudaStream_main = g_cudaStreams_main[id];
|
6401
|
+
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
6402
|
+
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
5971
6403
|
|
5972
|
-
|
5973
|
-
|
5974
|
-
CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
|
5975
|
-
}
|
6404
|
+
ggml_cuda_set_device(id);
|
6405
|
+
const cudaStream_t stream = g_cudaStreams[id][0];
|
5976
6406
|
|
5977
6407
|
if (src0_on_device && src0_is_contiguous) {
|
5978
|
-
|
5979
|
-
src0_ddf[id] = (float *) src0_extra->data_device[id];
|
5980
|
-
} else {
|
5981
|
-
src0_ddq[id] = (char *) src0_extra->data_device[id];
|
5982
|
-
}
|
6408
|
+
src0_dd[id] = (char *) src0_extra->data_device[id];
|
5983
6409
|
} else {
|
5984
|
-
|
5985
|
-
|
5986
|
-
} else {
|
5987
|
-
src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
|
5988
|
-
}
|
6410
|
+
const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
|
6411
|
+
src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
|
5989
6412
|
}
|
5990
6413
|
|
5991
|
-
if (
|
5992
|
-
|
6414
|
+
if (src1_on_device && src1_is_contiguous) {
|
6415
|
+
src1_ddf[id] = (float *) src1_extra->data_device[id];
|
6416
|
+
} else {
|
6417
|
+
src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
|
5993
6418
|
}
|
5994
6419
|
|
5995
|
-
if (
|
5996
|
-
|
5997
|
-
|
5998
|
-
|
5999
|
-
src1_ddf[id]
|
6420
|
+
if (convert_src1_to_q8_1) {
|
6421
|
+
src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
|
6422
|
+
|
6423
|
+
if (split && src1_on_device && src1_is_contiguous) {
|
6424
|
+
quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
|
6425
|
+
CUDA_CHECK(cudaGetLastError());
|
6000
6426
|
}
|
6001
6427
|
}
|
6428
|
+
|
6002
6429
|
if (dst_on_device) {
|
6003
|
-
|
6430
|
+
dst_dd[id] = (float *) dst_extra->data_device[id];
|
6004
6431
|
} else {
|
6005
|
-
size_t size_dst_ddf = split ?
|
6006
|
-
|
6432
|
+
const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
|
6433
|
+
dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
|
6007
6434
|
}
|
6435
|
+
}
|
6436
|
+
|
6437
|
+
// if multiple devices are used they need to wait for the main device
|
6438
|
+
// here an event is recorded that signals that the main device has finished calculating the input data
|
6439
|
+
if (split && g_device_count > 1) {
|
6440
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6441
|
+
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
|
6442
|
+
}
|
6008
6443
|
|
6009
|
-
|
6010
|
-
|
6011
|
-
|
6012
|
-
|
6444
|
+
const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
|
6445
|
+
for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
|
6446
|
+
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
|
6447
|
+
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
|
6013
6448
|
|
6014
|
-
|
6449
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6450
|
+
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
6451
|
+
continue;
|
6452
|
+
}
|
6015
6453
|
|
6016
|
-
|
6017
|
-
|
6018
|
-
|
6454
|
+
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
6455
|
+
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
6456
|
+
const int64_t row_diff = row_high[id] - row_low[id];
|
6019
6457
|
|
6020
|
-
|
6021
|
-
|
6022
|
-
|
6023
|
-
|
6024
|
-
|
6025
|
-
|
6026
|
-
|
6027
|
-
i01_low = row_low % rows_per_iter;
|
6028
|
-
}
|
6029
|
-
if (i0 == i0_offset_high) {
|
6030
|
-
i01_high = row_high % rows_per_iter;
|
6031
|
-
}
|
6032
|
-
}
|
6458
|
+
ggml_cuda_set_device(id);
|
6459
|
+
const cudaStream_t stream = g_cudaStreams[id][is];
|
6460
|
+
|
6461
|
+
// wait for main GPU data if necessary
|
6462
|
+
if (split && (id != g_main_device || is != 0)) {
|
6463
|
+
CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
|
6464
|
+
}
|
6033
6465
|
|
6034
|
-
|
6035
|
-
|
6036
|
-
|
6037
|
-
// The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
|
6038
|
-
GGML_ASSERT(i01_low == 0 || g_device_count > 1);
|
6039
|
-
GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
|
6466
|
+
for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
|
6467
|
+
const int64_t i03 = i0 / ne12;
|
6468
|
+
const int64_t i02 = i0 % ne12;
|
6040
6469
|
|
6041
|
-
const
|
6042
|
-
if (i01_diff == 0) {
|
6043
|
-
continue;
|
6044
|
-
}
|
6045
|
-
const int64_t i11 = i13*ne12 + i12;
|
6470
|
+
const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
6046
6471
|
|
6047
6472
|
// for split tensors the data begins at i0 == i0_offset_low
|
6048
|
-
char *
|
6049
|
-
float *
|
6050
|
-
|
6051
|
-
float *
|
6052
|
-
|
6053
|
-
// for split tensors the data pointer needs to be rounded down
|
6054
|
-
// to the bin edge for i03, i02 bins beyond the first
|
6055
|
-
if (i0 - i0_offset_low > 0) {
|
6056
|
-
GGML_ASSERT(!flatten_rows);
|
6057
|
-
src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
|
6058
|
-
src0_ddf_i -= (row_low % ne01)*ne00;
|
6059
|
-
dst_ddf_i -= (row_low % ne0)*ne1;
|
6060
|
-
}
|
6473
|
+
char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
|
6474
|
+
float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
|
6475
|
+
char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
|
6476
|
+
float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
|
6061
6477
|
|
6062
6478
|
// the main device memory buffer can be on VRAM scratch, with space for all partial results
|
6063
6479
|
// in that case an offset on dst_ddf_i is needed
|
6064
6480
|
if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
|
6065
|
-
|
6481
|
+
dst_dd_i += row_low[id]; // offset is 0 if no tensor split
|
6066
6482
|
}
|
6067
6483
|
|
6068
6484
|
// copy src0, src1 to device if necessary
|
6069
|
-
if (
|
6070
|
-
if (
|
6071
|
-
|
6072
|
-
|
6073
|
-
|
6074
|
-
|
6075
|
-
|
6076
|
-
GGML_ASSERT(!flatten_rows);
|
6485
|
+
if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
|
6486
|
+
if (id != g_main_device) {
|
6487
|
+
if (convert_src1_to_q8_1) {
|
6488
|
+
char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
|
6489
|
+
CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs,
|
6490
|
+
cudaMemcpyDeviceToDevice, stream));
|
6491
|
+
} else {
|
6077
6492
|
float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
|
6078
|
-
src1_ddf_i_source +=
|
6079
|
-
CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source,
|
6080
|
-
cudaMemcpyDeviceToDevice,
|
6493
|
+
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
|
6494
|
+
CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float),
|
6495
|
+
cudaMemcpyDeviceToDevice, stream));
|
6081
6496
|
}
|
6082
|
-
} else if (src1_on_device && !src1_is_contiguous) {
|
6083
|
-
GGML_ASSERT(!split);
|
6084
|
-
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
|
6085
|
-
} else {
|
6086
|
-
GGML_ASSERT(false);
|
6087
6497
|
}
|
6498
|
+
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
|
6499
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
|
6500
|
+
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
6501
|
+
} else {
|
6502
|
+
GGML_ASSERT(false);
|
6088
6503
|
}
|
6089
6504
|
|
6090
|
-
if (
|
6091
|
-
|
6092
|
-
|
6093
|
-
} else {
|
6094
|
-
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
|
6095
|
-
}
|
6505
|
+
if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
|
6506
|
+
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
6507
|
+
CUDA_CHECK(cudaGetLastError());
|
6096
6508
|
}
|
6097
6509
|
|
6098
|
-
|
6099
|
-
|
6100
|
-
to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
|
6101
|
-
CUDA_CHECK(cudaGetLastError());
|
6510
|
+
if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
|
6511
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
|
6102
6512
|
}
|
6103
6513
|
|
6104
6514
|
// do the computation
|
6105
|
-
op(src0, src1, dst,
|
6515
|
+
op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
|
6516
|
+
row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
|
6106
6517
|
CUDA_CHECK(cudaGetLastError());
|
6107
6518
|
|
6108
6519
|
// copy dst to host or other device if necessary
|
@@ -6124,95 +6535,86 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
6124
6535
|
// The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
|
6125
6536
|
// Instead they need to be copied to the correct slice in ne0 = dst row index.
|
6126
6537
|
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
|
6127
|
-
float * dhf_dst_i = (float *) ((char *) dst_off_device +
|
6128
|
-
|
6129
|
-
|
6538
|
+
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
6539
|
+
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
6540
|
+
dhf_dst_i += src1_col_0*ne0 + row_low[id];
|
6541
|
+
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float),
|
6542
|
+
row_diff*sizeof(float), src1_ncols, kind, stream));
|
6130
6543
|
} else {
|
6131
6544
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
6132
|
-
|
6545
|
+
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
6546
|
+
dhf_dst_i += src1_col_0*ne0;
|
6547
|
+
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
|
6133
6548
|
}
|
6134
6549
|
}
|
6135
6550
|
|
6136
|
-
//
|
6137
|
-
if (split &&
|
6138
|
-
CUDA_CHECK(cudaEventRecord(src0_extra->events[id],
|
6551
|
+
// add event for the main device to wait on until other device is done
|
6552
|
+
if (split && (id != g_main_device || is != 0)) {
|
6553
|
+
CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
|
6139
6554
|
}
|
6140
6555
|
}
|
6141
6556
|
}
|
6142
6557
|
}
|
6143
6558
|
|
6144
|
-
|
6145
|
-
|
6146
|
-
if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
|
6147
|
-
continue;
|
6148
|
-
}
|
6149
|
-
|
6150
|
-
CUDA_CHECK(cudaSetDevice(id));
|
6559
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6560
|
+
CUDA_CHECK(ggml_cuda_set_device(id));
|
6151
6561
|
|
6152
|
-
|
6153
|
-
|
6154
|
-
|
6155
|
-
if (src0_asf[id] > 0) {
|
6156
|
-
ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
|
6562
|
+
// free buffers again when done
|
6563
|
+
if (src0_as[id] > 0) {
|
6564
|
+
ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
|
6157
6565
|
}
|
6158
6566
|
if (src1_asf[id] > 0) {
|
6159
6567
|
ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
|
6160
6568
|
}
|
6161
|
-
if (
|
6162
|
-
ggml_cuda_pool_free(
|
6569
|
+
if (src1_asq[id] > 0) {
|
6570
|
+
ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
|
6571
|
+
}
|
6572
|
+
if (dst_as[id] > 0) {
|
6573
|
+
ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
|
6163
6574
|
}
|
6164
6575
|
}
|
6165
6576
|
|
6166
6577
|
// main device waits for all other devices to be finished
|
6167
6578
|
if (split && g_device_count > 1) {
|
6168
|
-
|
6169
|
-
|
6170
|
-
|
6171
|
-
|
6579
|
+
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
|
6580
|
+
is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
|
6581
|
+
|
6582
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6583
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6584
|
+
for (int64_t is = 0; is < is_max; ++is) {
|
6585
|
+
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
|
6172
6586
|
}
|
6173
6587
|
}
|
6174
6588
|
}
|
6175
6589
|
|
6176
6590
|
if (dst->backend == GGML_BACKEND_CPU) {
|
6177
|
-
CUDA_CHECK(
|
6591
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6178
6592
|
CUDA_CHECK(cudaDeviceSynchronize());
|
6179
6593
|
}
|
6180
6594
|
}
 
 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    // Due to flatten_rows == true this does in practice not make a difference however.
-    // Better solution would be nice but right now that would require disproportionate changes.
-    GGML_ASSERT(
-        (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
-        src1->type == GGML_TYPE_F32 &&
-        (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }
 
 void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
 }
 
 void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
 }
 
 void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }
 
 void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }
 
 void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
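
The element-wise wrappers above all route through ggml_cuda_op_flatten with a pointer to the corresponding ggml_cuda_op_* function. The ggml_cuda_op_flatten_t callback type itself is declared outside this hunk; judging from the call op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream) inside ggml_cuda_op_flatten, it presumably has the shape below — treat the exact typedef as an inference from the call sites, not a quote from the file:

// assumed shape of the flatten callback, inferred from how ggml_cuda_op_flatten invokes it
typedef void (*ggml_cuda_op_flatten_t)(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream);

// usage mirrors the wrappers above, e.g.:
//     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);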
|
6217
6619
|
|
6218
6620
|
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
@@ -6246,8 +6648,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne12 = src1->ne[2];
 
-    CUDA_CHECK(
-    cudaStream_t
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -6258,7 +6660,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
-    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12,
+    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 }
 
 void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -6277,8 +6679,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int64_t nb01 = src0->nb[1];
     const int64_t nb02 = src0->nb[2];
 
-    CUDA_CHECK(
-    cudaStream_t
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -6289,38 +6691,49 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
-    const
-    const
+    const int64_t row_stride_x = nb01 / sizeof(half);
+    const int64_t channel_stride_x = nb02 / sizeof(half);
 
-    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x,
+    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
         src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
 
+    int64_t min_compute_capability = INT_MAX;
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (min_compute_capability > g_compute_capabilities[id]
+            && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            min_compute_capability = g_compute_capabilities[id];
+        }
+    }
+
     if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
     } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
     }else if (src0->type == GGML_TYPE_F32) {
-
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
-        } else {
-            int min_compute_capability = INT_MAX;
-            for (int id = 0; id < g_device_count; ++id) {
-                if (min_compute_capability > g_compute_capabilities[id]
-                    && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-                    min_compute_capability = g_compute_capabilities[id];
-                }
-            }
 
+#ifdef GGML_CUDA_FORCE_DMMV
+            const bool use_mul_mat_vec_q = false;
+#else
+            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+#endif // GGML_CUDA_FORCE_DMMV
+
+            if (use_mul_mat_vec_q) {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+            } else {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+            }
+        } else {
             if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
-
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
             } else {
-
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
             }
         }
     } else {
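The dispatch in ggml_cuda_mul_mat now computes the minimum compute capability once, up front, and only over devices that actually receive rows according to g_tensor_split. That value gates both the quantized mat-vec path (use_mul_mat_vec_q, checked against MIN_CC_DP4A and overridable with GGML_CUDA_FORCE_DMMV) and the mul_mat_q path. The standalone sketch below shows one way such a per-device capability number can be derived with the CUDA runtime; the 100*major + 10*minor encoding and all names are assumptions for illustration, not taken from ggml-cuda.cu.

```cpp
// Sketch (not the gem's code): derive a per-device capability number and take
// the minimum across all visible GPUs, mirroring the min_compute_capability
// loop in the diff above.
#include <climits>
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int device_count = 0;
    if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count == 0) {
        std::fprintf(stderr, "no CUDA devices found\n");
        return 1;
    }

    int min_cc = INT_MAX;
    for (int id = 0; id < device_count; ++id) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, id) != cudaSuccess) {
            continue; // skip devices we cannot query
        }
        const int cc = 100 * prop.major + 10 * prop.minor; // assumed encoding
        std::printf("device %d: %s (cc %d)\n", id, prop.name, cc);
        if (cc < min_cc) {
            min_cc = cc;
        }
    }

    // A kernel that relies on byte-wise dot products (__dp4a) would only be
    // selected when every participating device meets the required capability.
    std::printf("minimum compute capability: %d\n", min_cc);
    return 0;
}
```

The idea behind taking the minimum is that a split matrix multiplication must run the same kernel choice on every GPU that holds a slice of the weights.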
@@ -6329,8 +6742,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
 }
 
 void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }
 
 void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6359,8 +6771,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     const int64_t nb11 = src1->nb[1];
     const int64_t nb12 = src1->nb[2];
 
-    CUDA_CHECK(
-    cudaStream_t
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -6370,10 +6782,10 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 
     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12,
+                              ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12,
+                              ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         GGML_ASSERT(false);
     }
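In ggml_cuda_cpy and the mat-vec paths above, the launch helpers now receive the stream explicitly (main_stream, taken from g_cudaStreams[g_main_device][0]) instead of relying on a single implicit stream. A minimal sketch of that pattern follows, with a placeholder kernel and names that are not from ggml-cuda.cu.

```cpp
// Sketch of threading an explicit cudaStream_t through a launcher. The kernel
// and block size are placeholders chosen for illustration.
#include <cuda_runtime.h>

__global__ void scale_f32(float * x, float v, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= v;
    }
}

static void scale_f32_cuda(float * x, float v, int n, cudaStream_t stream) {
    const int block = 256;
    const int grid  = (n + block - 1) / block;
    // all work stays on the caller's stream, so callers control ordering
    scale_f32<<<grid, block, 0, stream>>>(x, v, n);
}
```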
@@ -6387,28 +6799,20 @@ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 }
 
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
 }
 
 void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
 }
 
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
-
-    const int mode = ((int32_t *) dst->op_params)[2];
-    const bool is_glm = mode & 4;
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
 }
 
 void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }
 
 void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6418,7 +6822,7 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 }
 
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
-
+    const int64_t nrows = ggml_nrows(tensor);
 
     const int64_t ne0 = tensor->ne[0];
 
@@ -6428,14 +6832,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
 
-    for (
+    for (int64_t id = 0; id < g_device_count; ++id) {
         if (backend == GGML_BACKEND_GPU && id != g_main_device) {
             continue;
         }
 
-
+        ggml_cuda_set_device(id);
 
-
+        int64_t row_low, row_high;
         if (backend == GGML_BACKEND_GPU) {
             row_low = 0;
             row_high = nrows;
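ggml_cuda_transform_tensor now walks the devices with ggml_cuda_set_device(id) and derives a row_low/row_high range per device: for GGML_BACKEND_GPU the whole tensor stays on the main device, while GGML_BACKEND_GPU_SPLIT partitions rows according to g_tensor_split (the split points themselves are not shown in this excerpt). Below is a sketch of proportional row partitioning in that spirit, with hypothetical names and without whatever rounding or alignment the real code may apply.

```cpp
// Sketch only: proportional row partitioning. tensor_split holds the cumulative
// fraction of rows assigned before each device; device id gets the half-open
// row range [row_low, row_high).
#include <cstdint>

static void example_row_range(int64_t nrows, const float * tensor_split,
                              int device_count, int id,
                              int64_t * row_low, int64_t * row_high) {
    *row_low  = (int64_t)(tensor_split[id] * nrows);
    *row_high = (id + 1 < device_count) ? (int64_t)(tensor_split[id + 1] * nrows)
                                        : nrows; // last device takes the remainder
}
```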
@@ -6485,7 +6889,9 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         extra->data_device[id] = buf;
 
         if (backend == GGML_BACKEND_GPU_SPLIT) {
-
+            for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+                CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
+            }
         }
     }
 
@@ -6499,15 +6905,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
-    for (
+    for (int64_t id = 0; id < g_device_count; ++id) {
         if (extra->data_device[id] != nullptr) {
-            CUDA_CHECK(
+            CUDA_CHECK(ggml_cuda_set_device(id));
             CUDA_CHECK(cudaFree(extra->data_device[id]));
         }
 
-
-
-
+        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+            if (extra->events[id][is] != nullptr) {
+                CUDA_CHECK(ggml_cuda_set_device(id));
+                CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
+            }
         }
     }
 
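The split-tensor path now creates one timing-disabled event per device and per stream (MAX_STREAMS of them) and tears them down in ggml_cuda_free_data; the cudaStreamWaitEvent call at the top of this section is the consumer side of that machinery. The self-contained sketch below shows the generic record/wait pattern these events enable; the stream names and setup are illustrative, not the gem's code.

```cpp
// Sketch of cross-stream synchronization with a sync-only CUDA event: work on
// one stream is fenced by an event, and another stream waits on it on the GPU
// without blocking the host.
#include <cuda_runtime.h>

int main() {
    cudaStream_t producer, consumer;
    cudaStreamCreateWithFlags(&producer, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&consumer, cudaStreamNonBlocking);

    cudaEvent_t done;
    cudaEventCreateWithFlags(&done, cudaEventDisableTiming); // timing disabled: cheaper, sync-only

    // ... enqueue kernels / async copies on `producer` here ...
    cudaEventRecord(done, producer);        // fence everything queued on producer so far
    cudaStreamWaitEvent(consumer, done, 0); // consumer's later work waits for the fence
    // ... enqueue dependent work on `consumer` here ...

    cudaStreamSynchronize(consumer);
    cudaEventDestroy(done);
    cudaStreamDestroy(producer);
    cudaStreamDestroy(consumer);
    return 0;
}
```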
@@ -6559,7 +6967,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
-    CUDA_CHECK(
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
@@ -6608,6 +7016,7 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
         return;
     }
     if (g_scratch_buffer == nullptr) {
+        ggml_cuda_set_device(g_main_device);
        CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
     }
 
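ggml_cuda_assign_scratch_offset now selects the main device before the scratch buffer is first allocated, so the lazily created buffer lands on the GPU that will use it. A sketch of that lazy-allocation pattern with hypothetical names, not the gem's implementation:

```cpp
// Sketch: allocate a shared scratch buffer once, on a specific device.
#include <cstddef>
#include <cuda_runtime.h>

static void * g_example_scratch = nullptr;

static cudaError_t example_get_scratch(int main_device, size_t size, void ** out) {
    if (g_example_scratch == nullptr) {
        cudaError_t err = cudaSetDevice(main_device); // pin the device first
        if (err != cudaSuccess) {
            return err;
        }
        err = cudaMalloc(&g_example_scratch, size);   // allocated exactly once
        if (err != cudaSuccess) {
            return err;
        }
    }
    *out = g_example_scratch;
    return cudaSuccess;
}
```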
@@ -6647,7 +7056,7 @@ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
     ggml_cuda_assign_buffers_impl(tensor, false, true, false);
 }
 
-void ggml_cuda_set_main_device(int main_device) {
+void ggml_cuda_set_main_device(const int main_device) {
     if (main_device >= g_device_count) {
         fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
                 main_device, g_device_count, g_main_device);
@@ -6661,11 +7070,11 @@ void ggml_cuda_set_main_device(int main_device) {
     }
 }
 
-void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
+void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
     g_mul_mat_q = mul_mat_q;
 }
 
-void ggml_cuda_set_scratch_size(size_t scratch_size) {
+void ggml_cuda_set_scratch_size(const size_t scratch_size) {
     g_scratch_size = scratch_size;
 }
 