llama_cpp 0.5.1 → 0.5.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -3
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +0 -5
- data/ext/llama_cpp/src/ggml-cuda.cu +1011 -655
- data/ext/llama_cpp/src/ggml-metal.m +57 -15
- data/ext/llama_cpp/src/ggml-metal.metal +271 -137
- data/ext/llama_cpp/src/ggml.c +7 -3
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +4 -1
- data/ext/llama_cpp/src/llama.cpp +617 -141
- data/ext/llama_cpp/src/llama.h +8 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
@@ -13,7 +13,7 @@
 #ifdef __HIP_PLATFORM_AMD__
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
-#endif
+#endif // __HIP_PLATFORM_AMD__
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
@@ -68,19 +68,29 @@
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
-#endif
+#endif // defined(GGML_USE_HIPBLAS)
 
 #include "ggml-cuda.h"
 #include "ggml.h"
 
-#define MIN_CC_DP4A
-#
-#define
-#
+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define CC_TURING 700
+#define CC_OFFSET_AMD 1000000
+#define CC_RDNA2 CC_OFFSET_AMD + 1030
 
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+    defined(__gfx1150__) || defined(__gfx1151__)
+#define RDNA3
+#endif
+
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+#define RDNA2
+#endif
+
 #ifndef __has_builtin
 #define __has_builtin(x) 0
 #endif
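The new CC_* constants put NVIDIA and AMD devices on one integer scale: NVIDIA keeps its native 100*major + 10*minor value, while AMD capabilities are shifted by CC_OFFSET_AMD so that, for example, an RDNA2 part lands at CC_RDNA2 and a plain integer comparison can separate the architectures. A minimal illustrative sketch of that encoding (encode_cc is a hypothetical helper, not code from ggml-cuda.cu):

// Hypothetical sketch: encode a device's compute capability on the shared
// NVIDIA/AMD scale that the CC_TURING / CC_OFFSET_AMD / CC_RDNA2 constants use.
#include <cstdio>

static const int CC_TURING     = 700;
static const int CC_OFFSET_AMD = 1000000;
static const int CC_RDNA2      = CC_OFFSET_AMD + 1030;

static int encode_cc(int major, int minor, bool is_amd) {
    int cc = 100*major + 10*minor;   // e.g. 8.6 -> 860
    if (is_amd) {
        cc += CC_OFFSET_AMD;         // e.g. gfx1030 (10.3) -> 1001030
    }
    return cc;
}

int main() {
    const int nvidia_ampere = encode_cc(8, 6, false);
    const int amd_rdna2     = encode_cc(10, 3, true);
    // one comparison chain now distinguishes AMD RDNA2, other AMD, and Turing+ NVIDIA
    printf("%d %d %d\n", nvidia_ampere >= CC_TURING, amd_rdna2 >= CC_OFFSET_AMD, amd_rdna2 >= CC_RDNA2);
    return 0;
}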
@@ -132,7 +142,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 #endif
     return c;
 }
-#endif
+#endif // defined(GGML_USE_HIPBLAS)
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -144,8 +154,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do { \
         cudaError_t err_ = (err); \
         if (err_ != cudaSuccess) { \
-
+            int id; \
+            cudaGetDevice(&id); \
+            fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
                 cudaGetErrorString(err_)); \
+            fprintf(stderr, "current device: %d\n", id); \
             exit(1); \
         } \
     } while (0)
@@ -155,8 +168,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do { \
         cublasStatus_t err_ = (err); \
         if (err_ != CUBLAS_STATUS_SUCCESS) { \
+            int id; \
+            cudaGetDevice(&id); \
             fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
                     err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
+            fprintf(stderr, "current device: %d\n", id); \
             exit(1); \
         } \
     } while (0)
@@ -165,7 +181,10 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do { \
         cublasStatus_t err_ = (err); \
         if (err_ != CUBLAS_STATUS_SUCCESS) { \
+            int id; \
+            cudaGetDevice(&id); \
             fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
+            fprintf(stderr, "current device: %d\n", id); \
             exit(1); \
         } \
     } while (0)
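These three hunks extend the same error-check pattern: a do { ... } while (0) macro that queries the current device before aborting, so a failure on a multi-GPU box says which GPU it happened on. A self-contained illustration of the pattern (MY_CUDA_CHECK is a stand-in name, not the library's macro):

// Illustrative only: a stripped-down version of the device-aware error check above.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define MY_CUDA_CHECK(err)                                                     \
    do {                                                                       \
        cudaError_t err_ = (err);                                              \
        if (err_ != cudaSuccess) {                                             \
            int id;                                                            \
            cudaGetDevice(&id);                                                \
            fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__,  \
                    __LINE__, cudaGetErrorString(err_));                       \
            fprintf(stderr, "current device: %d\n", id);                       \
            exit(1);                                                           \
        }                                                                      \
    } while (0)

int main() {
    int count = 0;
    MY_CUDA_CHECK(cudaGetDeviceCount(&count)); // aborts with a device-aware message on failure
    printf("devices: %d\n", count);
    return 0;
}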
@@ -212,10 +231,13 @@ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__
 typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
-typedef void (*
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
-
-    cudaStream_t &
+typedef void (*ggml_cuda_op_mul_mat_t)(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, const cudaStream_t & stream);
+typedef void (*ggml_cuda_op_flatten_t)(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream);
 
 // QK = number of values after dequantization
 // QR = QK / number of values before dequantization
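The refactor splits the old generic operator pointer into two callback types: ggml_cuda_op_mul_mat_t for per-row-slice matrix-multiply work and ggml_cuda_op_flatten_t for element-wise ops that see whole device buffers. A simplified sketch of the callback-dispatch idea (stand-in types, not the real ggml structs or signatures):

// Sketch of the pattern behind ggml_cuda_op_flatten_t: one driver owns buffer
// handling and defers the math to a function pointer.
#include <cstdio>

struct tensor { int n; const float * data; };   // stand-in for ggml_tensor

typedef void (*op_flatten_t)(const tensor * src0, const tensor * src1, tensor * dst, float * dst_dd);

static void op_add(const tensor * src0, const tensor * src1, tensor * dst, float * dst_dd) {
    for (int i = 0; i < dst->n; ++i) {          // on the GPU this would be a kernel launch
        dst_dd[i] = src0->data[i] + src1->data[i];
    }
    (void) src1;
}

static void run_flatten_op(op_flatten_t op, const tensor * a, const tensor * b, tensor * out, float * buf) {
    op(a, b, out, buf);                         // buffer setup/teardown would live here
}

int main() {
    const float x[3] = {1, 2, 3}, y[3] = {4, 5, 6};
    float z[3];
    tensor ta = {3, x}, tb = {3, y}, tc = {3, nullptr};
    run_flatten_op(op_add, &ta, &tb, &tc, z);
    printf("%g %g %g\n", z[0], z[1], z[2]);
    return 0;
}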
@@ -396,11 +418,29 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif
 
+#define MUL_MAT_SRC1_COL_STRIDE 128
+
+#define MAX_STREAMS 8
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+
 struct ggml_tensor_extra_gpu {
     void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
 };
 
+// this is faster on Windows
+// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
+inline cudaError_t ggml_cuda_set_device(const int device) {
+    int current_device;
+    CUDA_CHECK(cudaGetDevice(&current_device));
+
+    if (device == current_device) {
+        return cudaSuccess;
+    }
+
+    return cudaSetDevice(device);
+}
+
 static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
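The single per-device main stream is replaced by a [device][stream] table, and ggml_cuda_set_device only calls cudaSetDevice when the device actually changes. A hedged standalone sketch of both ideas (MAX_DEV, N_STREAMS and set_device_if_needed are illustrative names, not the library's):

// Sketch: per-device non-blocking stream table plus a "set device only if needed" guard.
#include <cuda_runtime.h>

#define MAX_DEV   16
#define N_STREAMS  8

static cudaStream_t streams[MAX_DEV][N_STREAMS] = { nullptr };

static cudaError_t set_device_if_needed(int device) {
    int current = -1;
    cudaError_t err = cudaGetDevice(&current);
    if (err != cudaSuccess) {
        return err;
    }
    if (current == device) {
        return cudaSuccess;            // skip the redundant driver call
    }
    return cudaSetDevice(device);
}

int main() {
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        return 0;                       // no GPU available; nothing to demonstrate
    }
    for (int id = 0; id < count && id < MAX_DEV; ++id) {
        set_device_if_needed(id);
        for (int is = 0; is < N_STREAMS; ++is) {
            cudaStreamCreateWithFlags(&streams[id][is], cudaStreamNonBlocking);
        }
    }
    return 0;
}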
@@ -413,8 +453,6 @@ static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -3444,6 +3482,12 @@ static __device__ __forceinline__ void mul_mat_q(
     }
 }
 
+#define MMQ_X_Q4_0_RDNA2 64
+#define MMQ_Y_Q4_0_RDNA2 128
+#define NWARPS_Q4_0_RDNA2 8
+#define MMQ_X_Q4_0_RDNA1 64
+#define MMQ_Y_Q4_0_RDNA1 64
+#define NWARPS_Q4_0_RDNA1 8
 #define MMQ_X_Q4_0_AMPERE 64
 #define MMQ_Y_Q4_0_AMPERE 128
 #define NWARPS_Q4_0_AMPERE 4
@@ -3451,11 +3495,32 @@ static __device__ __forceinline__ void mul_mat_q(
|
|
3451
3495
|
#define MMQ_Y_Q4_0_PASCAL 64
|
3452
3496
|
#define NWARPS_Q4_0_PASCAL 8
|
3453
3497
|
|
3454
|
-
template <bool need_check> static __global__ void
|
3498
|
+
template <bool need_check> static __global__ void
|
3499
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3500
|
+
#if defined(RDNA3) || defined(RDNA2)
|
3501
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
|
3502
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
3503
|
+
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3504
|
+
mul_mat_q4_0(
|
3455
3505
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3456
3506
|
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3457
3507
|
|
3458
|
-
#if
|
3508
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3509
|
+
#if defined(RDNA3) || defined(RDNA2)
|
3510
|
+
const int mmq_x = MMQ_X_Q4_0_RDNA2;
|
3511
|
+
const int mmq_y = MMQ_Y_Q4_0_RDNA2;
|
3512
|
+
const int nwarps = NWARPS_Q4_0_RDNA2;
|
3513
|
+
#else
|
3514
|
+
const int mmq_x = MMQ_X_Q4_0_RDNA1;
|
3515
|
+
const int mmq_y = MMQ_Y_Q4_0_RDNA1;
|
3516
|
+
const int nwarps = NWARPS_Q4_0_RDNA1;
|
3517
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
3518
|
+
|
3519
|
+
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
|
3520
|
+
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
|
3521
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3522
|
+
|
3523
|
+
#elif __CUDA_ARCH__ >= CC_TURING
|
3459
3524
|
const int mmq_x = MMQ_X_Q4_0_AMPERE;
|
3460
3525
|
const int mmq_y = MMQ_Y_Q4_0_AMPERE;
|
3461
3526
|
const int nwarps = NWARPS_Q4_0_AMPERE;
|
@@ -3478,6 +3543,12 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
|
|
3478
3543
|
#endif // __CUDA_ARCH__ >= CC_TURING
|
3479
3544
|
}
|
3480
3545
|
|
3546
|
+
#define MMQ_X_Q4_1_RDNA2 64
|
3547
|
+
#define MMQ_Y_Q4_1_RDNA2 128
|
3548
|
+
#define NWARPS_Q4_1_RDNA2 8
|
3549
|
+
#define MMQ_X_Q4_1_RDNA1 64
|
3550
|
+
#define MMQ_Y_Q4_1_RDNA1 64
|
3551
|
+
#define NWARPS_Q4_1_RDNA1 8
|
3481
3552
|
#define MMQ_X_Q4_1_AMPERE 64
|
3482
3553
|
#define MMQ_Y_Q4_1_AMPERE 128
|
3483
3554
|
#define NWARPS_Q4_1_AMPERE 4
|
@@ -3486,14 +3557,33 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
|
|
3486
3557
|
#define NWARPS_Q4_1_PASCAL 8
|
3487
3558
|
|
3488
3559
|
template <bool need_check> static __global__ void
|
3489
|
-
#if
|
3560
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3561
|
+
#if defined(RDNA3) || defined(RDNA2)
|
3562
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
|
3563
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
3564
|
+
#elif __CUDA_ARCH__ < CC_TURING
|
3490
3565
|
__launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
|
3491
3566
|
#endif // __CUDA_ARCH__ < CC_TURING
|
3492
3567
|
mul_mat_q4_1(
|
3493
3568
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3494
3569
|
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3495
3570
|
|
3496
|
-
#if
|
3571
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3572
|
+
#if defined(RDNA3) || defined(RDNA2)
|
3573
|
+
const int mmq_x = MMQ_X_Q4_1_RDNA2;
|
3574
|
+
const int mmq_y = MMQ_Y_Q4_1_RDNA2;
|
3575
|
+
const int nwarps = NWARPS_Q4_1_RDNA2;
|
3576
|
+
#else
|
3577
|
+
const int mmq_x = MMQ_X_Q4_1_RDNA1;
|
3578
|
+
const int mmq_y = MMQ_Y_Q4_1_RDNA1;
|
3579
|
+
const int nwarps = NWARPS_Q4_1_RDNA1;
|
3580
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
3581
|
+
|
3582
|
+
mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
|
3583
|
+
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
|
3584
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3585
|
+
|
3586
|
+
#elif __CUDA_ARCH__ >= CC_TURING
|
3497
3587
|
const int mmq_x = MMQ_X_Q4_1_AMPERE;
|
3498
3588
|
const int mmq_y = MMQ_Y_Q4_1_AMPERE;
|
3499
3589
|
const int nwarps = NWARPS_Q4_1_AMPERE;
|
@@ -3516,6 +3606,12 @@ template <bool need_check> static __global__ void
|
|
3516
3606
|
#endif // __CUDA_ARCH__ >= CC_TURING
|
3517
3607
|
}
|
3518
3608
|
|
3609
|
+
#define MMQ_X_Q5_0_RDNA2 64
|
3610
|
+
#define MMQ_Y_Q5_0_RDNA2 128
|
3611
|
+
#define NWARPS_Q5_0_RDNA2 8
|
3612
|
+
#define MMQ_X_Q5_0_RDNA1 64
|
3613
|
+
#define MMQ_Y_Q5_0_RDNA1 64
|
3614
|
+
#define NWARPS_Q5_0_RDNA1 8
|
3519
3615
|
#define MMQ_X_Q5_0_AMPERE 128
|
3520
3616
|
#define MMQ_Y_Q5_0_AMPERE 64
|
3521
3617
|
#define NWARPS_Q5_0_AMPERE 4
|
@@ -3523,11 +3619,32 @@ template <bool need_check> static __global__ void
|
|
3523
3619
|
#define MMQ_Y_Q5_0_PASCAL 64
|
3524
3620
|
#define NWARPS_Q5_0_PASCAL 8
|
3525
3621
|
|
3526
|
-
template <bool need_check> static __global__ void
|
3622
|
+
template <bool need_check> static __global__ void
|
3623
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3624
|
+
#if defined(RDNA3) || defined(RDNA2)
|
3625
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
|
3626
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
3627
|
+
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3628
|
+
mul_mat_q5_0(
|
3527
3629
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3528
3630
|
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3529
3631
|
|
3530
|
-
#if
|
3632
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3633
|
+
#if defined(RDNA3) || defined(RDNA2)
|
3634
|
+
const int mmq_x = MMQ_X_Q5_0_RDNA2;
|
3635
|
+
const int mmq_y = MMQ_Y_Q5_0_RDNA2;
|
3636
|
+
const int nwarps = NWARPS_Q5_0_RDNA2;
|
3637
|
+
#else
|
3638
|
+
const int mmq_x = MMQ_X_Q5_0_RDNA1;
|
3639
|
+
const int mmq_y = MMQ_Y_Q5_0_RDNA1;
|
3640
|
+
const int nwarps = NWARPS_Q5_0_RDNA1;
|
3641
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
3642
|
+
|
3643
|
+
mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
|
3644
|
+
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
|
3645
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3646
|
+
|
3647
|
+
#elif __CUDA_ARCH__ >= CC_TURING
|
3531
3648
|
const int mmq_x = MMQ_X_Q5_0_AMPERE;
|
3532
3649
|
const int mmq_y = MMQ_Y_Q5_0_AMPERE;
|
3533
3650
|
const int nwarps = NWARPS_Q5_0_AMPERE;
|
@@ -3550,6 +3667,12 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
|
|
3550
3667
|
#endif // __CUDA_ARCH__ >= CC_TURING
|
3551
3668
|
}
|
3552
3669
|
|
3670
|
+
#define MMQ_X_Q5_1_RDNA2 64
|
3671
|
+
#define MMQ_Y_Q5_1_RDNA2 128
|
3672
|
+
#define NWARPS_Q5_1_RDNA2 8
|
3673
|
+
#define MMQ_X_Q5_1_RDNA1 64
|
3674
|
+
#define MMQ_Y_Q5_1_RDNA1 64
|
3675
|
+
#define NWARPS_Q5_1_RDNA1 8
|
3553
3676
|
#define MMQ_X_Q5_1_AMPERE 128
|
3554
3677
|
#define MMQ_Y_Q5_1_AMPERE 64
|
3555
3678
|
#define NWARPS_Q5_1_AMPERE 4
|
@@ -3557,11 +3680,32 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
|
|
3557
3680
|
#define MMQ_Y_Q5_1_PASCAL 64
|
3558
3681
|
#define NWARPS_Q5_1_PASCAL 8
|
3559
3682
|
|
3560
|
-
template <bool need_check> static __global__ void
|
3683
|
+
template <bool need_check> static __global__ void
|
3684
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3685
|
+
#if defined(RDNA3) || defined(RDNA2)
|
3686
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
|
3687
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
3688
|
+
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3689
|
+
mul_mat_q5_1(
|
3561
3690
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3562
3691
|
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3563
3692
|
|
3564
|
-
#if
|
3693
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3694
|
+
#if defined(RDNA3) || defined(RDNA2)
|
3695
|
+
const int mmq_x = MMQ_X_Q5_1_RDNA2;
|
3696
|
+
const int mmq_y = MMQ_Y_Q5_1_RDNA2;
|
3697
|
+
const int nwarps = NWARPS_Q5_1_RDNA2;
|
3698
|
+
#else
|
3699
|
+
const int mmq_x = MMQ_X_Q5_1_RDNA1;
|
3700
|
+
const int mmq_y = MMQ_Y_Q5_1_RDNA1;
|
3701
|
+
const int nwarps = NWARPS_Q5_1_RDNA1;
|
3702
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
3703
|
+
|
3704
|
+
mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
|
3705
|
+
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
|
3706
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3707
|
+
|
3708
|
+
#elif __CUDA_ARCH__ >= CC_TURING
|
3565
3709
|
const int mmq_x = MMQ_X_Q5_1_AMPERE;
|
3566
3710
|
const int mmq_y = MMQ_Y_Q5_1_AMPERE;
|
3567
3711
|
const int nwarps = NWARPS_Q5_1_AMPERE;
|
@@ -3584,6 +3728,12 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
|
|
3584
3728
|
#endif // __CUDA_ARCH__ >= CC_TURING
|
3585
3729
|
}
|
3586
3730
|
|
3731
|
+
#define MMQ_X_Q8_0_RDNA2 64
|
3732
|
+
#define MMQ_Y_Q8_0_RDNA2 128
|
3733
|
+
#define NWARPS_Q8_0_RDNA2 8
|
3734
|
+
#define MMQ_X_Q8_0_RDNA1 64
|
3735
|
+
#define MMQ_Y_Q8_0_RDNA1 64
|
3736
|
+
#define NWARPS_Q8_0_RDNA1 8
|
3587
3737
|
#define MMQ_X_Q8_0_AMPERE 128
|
3588
3738
|
#define MMQ_Y_Q8_0_AMPERE 64
|
3589
3739
|
#define NWARPS_Q8_0_AMPERE 4
|
@@ -3591,11 +3741,32 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
|
|
3591
3741
|
#define MMQ_Y_Q8_0_PASCAL 64
|
3592
3742
|
#define NWARPS_Q8_0_PASCAL 8
|
3593
3743
|
|
3594
|
-
template <bool need_check> static __global__ void
|
3744
|
+
template <bool need_check> static __global__ void
|
3745
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3746
|
+
#if defined(RDNA3) || defined(RDNA2)
|
3747
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
|
3748
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
3749
|
+
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3750
|
+
mul_mat_q8_0(
|
3595
3751
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3596
3752
|
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3597
3753
|
|
3598
|
-
#if
|
3754
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3755
|
+
#if defined(RDNA3) || defined(RDNA2)
|
3756
|
+
const int mmq_x = MMQ_X_Q8_0_RDNA2;
|
3757
|
+
const int mmq_y = MMQ_Y_Q8_0_RDNA2;
|
3758
|
+
const int nwarps = NWARPS_Q8_0_RDNA2;
|
3759
|
+
#else
|
3760
|
+
const int mmq_x = MMQ_X_Q8_0_RDNA1;
|
3761
|
+
const int mmq_y = MMQ_Y_Q8_0_RDNA1;
|
3762
|
+
const int nwarps = NWARPS_Q8_0_RDNA1;
|
3763
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
3764
|
+
|
3765
|
+
mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
|
3766
|
+
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
|
3767
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3768
|
+
|
3769
|
+
#elif __CUDA_ARCH__ >= CC_TURING
|
3599
3770
|
const int mmq_x = MMQ_X_Q8_0_AMPERE;
|
3600
3771
|
const int mmq_y = MMQ_Y_Q8_0_AMPERE;
|
3601
3772
|
const int nwarps = NWARPS_Q8_0_AMPERE;
|
@@ -3618,6 +3789,12 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
|
|
3618
3789
|
#endif // __CUDA_ARCH__ >= CC_TURING
|
3619
3790
|
}
|
3620
3791
|
|
3792
|
+
#define MMQ_X_Q2_K_RDNA2 64
|
3793
|
+
#define MMQ_Y_Q2_K_RDNA2 128
|
3794
|
+
#define NWARPS_Q2_K_RDNA2 8
|
3795
|
+
#define MMQ_X_Q2_K_RDNA1 128
|
3796
|
+
#define MMQ_Y_Q2_K_RDNA1 32
|
3797
|
+
#define NWARPS_Q2_K_RDNA1 8
|
3621
3798
|
#define MMQ_X_Q2_K_AMPERE 64
|
3622
3799
|
#define MMQ_Y_Q2_K_AMPERE 128
|
3623
3800
|
#define NWARPS_Q2_K_AMPERE 4
|
@@ -3625,11 +3802,32 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
|
|
3625
3802
|
#define MMQ_Y_Q2_K_PASCAL 64
|
3626
3803
|
#define NWARPS_Q2_K_PASCAL 8
|
3627
3804
|
|
3628
|
-
template <bool need_check> static __global__ void
|
3805
|
+
template <bool need_check> static __global__ void
|
3806
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3807
|
+
#if defined(RDNA3) || defined(RDNA2)
|
3808
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
|
3809
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
3810
|
+
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3811
|
+
mul_mat_q2_K(
|
3629
3812
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3630
3813
|
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3631
3814
|
|
3632
|
-
#if
|
3815
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3816
|
+
#if defined(RDNA3) || defined(RDNA2)
|
3817
|
+
const int mmq_x = MMQ_X_Q2_K_RDNA2;
|
3818
|
+
const int mmq_y = MMQ_Y_Q2_K_RDNA2;
|
3819
|
+
const int nwarps = NWARPS_Q2_K_RDNA2;
|
3820
|
+
#else
|
3821
|
+
const int mmq_x = MMQ_X_Q2_K_RDNA1;
|
3822
|
+
const int mmq_y = MMQ_Y_Q2_K_RDNA1;
|
3823
|
+
const int nwarps = NWARPS_Q2_K_RDNA1;
|
3824
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
3825
|
+
|
3826
|
+
mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
|
3827
|
+
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
|
3828
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3829
|
+
|
3830
|
+
#elif __CUDA_ARCH__ >= CC_TURING
|
3633
3831
|
const int mmq_x = MMQ_X_Q2_K_AMPERE;
|
3634
3832
|
const int mmq_y = MMQ_Y_Q2_K_AMPERE;
|
3635
3833
|
const int nwarps = NWARPS_Q2_K_AMPERE;
|
@@ -3652,6 +3850,12 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
|
|
3652
3850
|
#endif // __CUDA_ARCH__ >= CC_TURING
|
3653
3851
|
}
|
3654
3852
|
|
3853
|
+
#define MMQ_X_Q3_K_RDNA2 128
|
3854
|
+
#define MMQ_Y_Q3_K_RDNA2 64
|
3855
|
+
#define NWARPS_Q3_K_RDNA2 8
|
3856
|
+
#define MMQ_X_Q3_K_RDNA1 32
|
3857
|
+
#define MMQ_Y_Q3_K_RDNA1 128
|
3858
|
+
#define NWARPS_Q3_K_RDNA1 8
|
3655
3859
|
#define MMQ_X_Q3_K_AMPERE 128
|
3656
3860
|
#define MMQ_Y_Q3_K_AMPERE 128
|
3657
3861
|
#define NWARPS_Q3_K_AMPERE 4
|
@@ -3660,14 +3864,33 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
|
|
3660
3864
|
#define NWARPS_Q3_K_PASCAL 8
|
3661
3865
|
|
3662
3866
|
template <bool need_check> static __global__ void
|
3663
|
-
#if
|
3867
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3868
|
+
#if defined(RDNA3) || defined(RDNA2)
|
3869
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
|
3870
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
3871
|
+
#elif __CUDA_ARCH__ < CC_TURING
|
3664
3872
|
__launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
|
3665
3873
|
#endif // __CUDA_ARCH__ < CC_TURING
|
3666
3874
|
mul_mat_q3_K(
|
3667
3875
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3668
3876
|
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3669
3877
|
|
3670
|
-
#if
|
3878
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3879
|
+
#if defined(RDNA3) || defined(RDNA2)
|
3880
|
+
const int mmq_x = MMQ_X_Q3_K_RDNA2;
|
3881
|
+
const int mmq_y = MMQ_Y_Q3_K_RDNA2;
|
3882
|
+
const int nwarps = NWARPS_Q3_K_RDNA2;
|
3883
|
+
#else
|
3884
|
+
const int mmq_x = MMQ_X_Q3_K_RDNA1;
|
3885
|
+
const int mmq_y = MMQ_Y_Q3_K_RDNA1;
|
3886
|
+
const int nwarps = NWARPS_Q3_K_RDNA1;
|
3887
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
3888
|
+
|
3889
|
+
mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
|
3890
|
+
load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
|
3891
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3892
|
+
|
3893
|
+
#elif __CUDA_ARCH__ >= CC_TURING
|
3671
3894
|
const int mmq_x = MMQ_X_Q3_K_AMPERE;
|
3672
3895
|
const int mmq_y = MMQ_Y_Q3_K_AMPERE;
|
3673
3896
|
const int nwarps = NWARPS_Q3_K_AMPERE;
|
@@ -3690,6 +3913,12 @@ template <bool need_check> static __global__ void
|
|
3690
3913
|
#endif // __CUDA_ARCH__ >= CC_TURING
|
3691
3914
|
}
|
3692
3915
|
|
3916
|
+
#define MMQ_X_Q4_K_RDNA2 64
|
3917
|
+
#define MMQ_Y_Q4_K_RDNA2 128
|
3918
|
+
#define NWARPS_Q4_K_RDNA2 8
|
3919
|
+
#define MMQ_X_Q4_K_RDNA1 32
|
3920
|
+
#define MMQ_Y_Q4_K_RDNA1 64
|
3921
|
+
#define NWARPS_Q4_K_RDNA1 8
|
3693
3922
|
#define MMQ_X_Q4_K_AMPERE 64
|
3694
3923
|
#define MMQ_Y_Q4_K_AMPERE 128
|
3695
3924
|
#define NWARPS_Q4_K_AMPERE 4
|
@@ -3698,14 +3927,33 @@ template <bool need_check> static __global__ void
|
|
3698
3927
|
#define NWARPS_Q4_K_PASCAL 8
|
3699
3928
|
|
3700
3929
|
template <bool need_check> static __global__ void
|
3701
|
-
#if
|
3930
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3931
|
+
#if defined(RDNA3) || defined(RDNA2)
|
3932
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
|
3933
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
3934
|
+
#elif __CUDA_ARCH__ < CC_TURING
|
3702
3935
|
__launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
|
3703
3936
|
#endif // __CUDA_ARCH__ < CC_TURING
|
3704
3937
|
mul_mat_q4_K(
|
3705
3938
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3706
3939
|
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3707
3940
|
|
3708
|
-
#if
|
3941
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3942
|
+
#if defined(RDNA3) || defined(RDNA2)
|
3943
|
+
const int mmq_x = MMQ_X_Q4_K_RDNA2;
|
3944
|
+
const int mmq_y = MMQ_Y_Q4_K_RDNA2;
|
3945
|
+
const int nwarps = NWARPS_Q4_K_RDNA2;
|
3946
|
+
#else
|
3947
|
+
const int mmq_x = MMQ_X_Q4_K_RDNA1;
|
3948
|
+
const int mmq_y = MMQ_Y_Q4_K_RDNA1;
|
3949
|
+
const int nwarps = NWARPS_Q4_K_RDNA1;
|
3950
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
3951
|
+
|
3952
|
+
mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
|
3953
|
+
load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
|
3954
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
3955
|
+
|
3956
|
+
#elif __CUDA_ARCH__ >= CC_TURING
|
3709
3957
|
const int mmq_x = MMQ_X_Q4_K_AMPERE;
|
3710
3958
|
const int mmq_y = MMQ_Y_Q4_K_AMPERE;
|
3711
3959
|
const int nwarps = NWARPS_Q4_K_AMPERE;
|
@@ -3728,6 +3976,12 @@ template <bool need_check> static __global__ void
|
|
3728
3976
|
#endif // __CUDA_ARCH__ >= CC_TURING
|
3729
3977
|
}
|
3730
3978
|
|
3979
|
+
#define MMQ_X_Q5_K_RDNA2 64
|
3980
|
+
#define MMQ_Y_Q5_K_RDNA2 128
|
3981
|
+
#define NWARPS_Q5_K_RDNA2 8
|
3982
|
+
#define MMQ_X_Q5_K_RDNA1 32
|
3983
|
+
#define MMQ_Y_Q5_K_RDNA1 64
|
3984
|
+
#define NWARPS_Q5_K_RDNA1 8
|
3731
3985
|
#define MMQ_X_Q5_K_AMPERE 64
|
3732
3986
|
#define MMQ_Y_Q5_K_AMPERE 128
|
3733
3987
|
#define NWARPS_Q5_K_AMPERE 4
|
@@ -3735,11 +3989,32 @@ template <bool need_check> static __global__ void
|
|
3735
3989
|
#define MMQ_Y_Q5_K_PASCAL 64
|
3736
3990
|
#define NWARPS_Q5_K_PASCAL 8
|
3737
3991
|
|
3738
|
-
template <bool need_check> static __global__ void
|
3992
|
+
template <bool need_check> static __global__ void
|
3993
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3994
|
+
#if defined(RDNA3) || defined(RDNA2)
|
3995
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
|
3996
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
3997
|
+
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
3998
|
+
mul_mat_q5_K(
|
3739
3999
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3740
4000
|
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3741
4001
|
|
3742
|
-
#if
|
4002
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
4003
|
+
#if defined(RDNA3) || defined(RDNA2)
|
4004
|
+
const int mmq_x = MMQ_X_Q5_K_RDNA2;
|
4005
|
+
const int mmq_y = MMQ_Y_Q5_K_RDNA2;
|
4006
|
+
const int nwarps = NWARPS_Q5_K_RDNA2;
|
4007
|
+
#else
|
4008
|
+
const int mmq_x = MMQ_X_Q5_K_RDNA1;
|
4009
|
+
const int mmq_y = MMQ_Y_Q5_K_RDNA1;
|
4010
|
+
const int nwarps = NWARPS_Q5_K_RDNA1;
|
4011
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
4012
|
+
|
4013
|
+
mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
|
4014
|
+
load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
|
4015
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4016
|
+
|
4017
|
+
#elif __CUDA_ARCH__ >= CC_TURING
|
3743
4018
|
const int mmq_x = MMQ_X_Q5_K_AMPERE;
|
3744
4019
|
const int mmq_y = MMQ_Y_Q5_K_AMPERE;
|
3745
4020
|
const int nwarps = NWARPS_Q5_K_AMPERE;
|
@@ -3762,6 +4037,12 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
|
|
3762
4037
|
#endif // __CUDA_ARCH__ >= CC_TURING
|
3763
4038
|
}
|
3764
4039
|
|
4040
|
+
#define MMQ_X_Q6_K_RDNA2 64
|
4041
|
+
#define MMQ_Y_Q6_K_RDNA2 128
|
4042
|
+
#define NWARPS_Q6_K_RDNA2 8
|
4043
|
+
#define MMQ_X_Q6_K_RDNA1 32
|
4044
|
+
#define MMQ_Y_Q6_K_RDNA1 64
|
4045
|
+
#define NWARPS_Q6_K_RDNA1 8
|
3765
4046
|
#define MMQ_X_Q6_K_AMPERE 64
|
3766
4047
|
#define MMQ_Y_Q6_K_AMPERE 64
|
3767
4048
|
#define NWARPS_Q6_K_AMPERE 4
|
@@ -3770,14 +4051,33 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
|
|
3770
4051
|
#define NWARPS_Q6_K_PASCAL 8
|
3771
4052
|
|
3772
4053
|
template <bool need_check> static __global__ void
|
3773
|
-
#if
|
4054
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
4055
|
+
#if defined(RDNA3) || defined(RDNA2)
|
4056
|
+
__launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
|
4057
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
4058
|
+
#elif __CUDA_ARCH__ < CC_TURING
|
3774
4059
|
__launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
|
3775
4060
|
#endif // __CUDA_ARCH__ < CC_TURING
|
3776
4061
|
mul_mat_q6_K(
|
3777
4062
|
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
3778
4063
|
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
|
3779
4064
|
|
3780
|
-
#if
|
4065
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
4066
|
+
#if defined(RDNA3) || defined(RDNA2)
|
4067
|
+
const int mmq_x = MMQ_X_Q6_K_RDNA2;
|
4068
|
+
const int mmq_y = MMQ_Y_Q6_K_RDNA2;
|
4069
|
+
const int nwarps = NWARPS_Q6_K_RDNA2;
|
4070
|
+
#else
|
4071
|
+
const int mmq_x = MMQ_X_Q6_K_RDNA1;
|
4072
|
+
const int mmq_y = MMQ_Y_Q6_K_RDNA1;
|
4073
|
+
const int nwarps = NWARPS_Q6_K_RDNA1;
|
4074
|
+
#endif // defined(RDNA3) || defined(RDNA2)
|
4075
|
+
|
4076
|
+
mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
|
4077
|
+
load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
|
4078
|
+
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4079
|
+
|
4080
|
+
#elif __CUDA_ARCH__ >= CC_TURING
|
3781
4081
|
const int mmq_x = MMQ_X_Q6_K_AMPERE;
|
3782
4082
|
const int mmq_y = MMQ_Y_Q6_K_AMPERE;
|
3783
4083
|
const int nwarps = NWARPS_Q6_K_AMPERE;
|
@@ -4086,7 +4386,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
     dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }
 
-static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
+    const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;
 
@@ -4098,8 +4399,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     const int i = row*ncols + col;
 
     const float col_theta_scale = powf(theta_scale, col);
+    const float p = p0 + p_delta*(row/p_delta_rows);
 
-    const float theta = p*col_theta_scale;
+    const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);
 
@@ -4109,7 +4411,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     dst[i + 0] = x0*cos_theta - x1*sin_theta;
     dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
 
-    const float block_theta =
+    const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
     const float sin_block_theta = sinf(block_theta);
     const float cos_block_theta = cosf(block_theta);
 
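The rewritten rope_glm_f32 splits each position into a rotary part capped at p_delta*(n_ctx - 2) and a block part that only grows once the cap is reached, so the two parts always add up to the full position. A scalar sketch of that min/max split (the p_delta and n_ctx values below are illustrative):

// Scalar sketch of the position split used by the updated rope_glm_f32 kernel.
#include <algorithm>
#include <cstdio>

int main() {
    const float p_delta = 1.0f;           // position step per row (illustrative)
    const int   n_ctx   = 8;              // context length (illustrative)
    const float cap     = p_delta*(n_ctx - 2);

    for (int pos = 0; pos < 10; ++pos) {
        const float p         = (float) pos;
        const float theta_pos = std::min(p, cap);         // feeds sin_theta / cos_theta
        const float block_pos = std::max(p - cap, 0.0f);   // feeds sin_block_theta / cos_block_theta
        printf("p=%d rotary=%g block=%g\n", pos, theta_pos, block_pos);
    }
    return 0;
}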
@@ -4558,7 +4860,15 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q4_0_RDNA2;
+        mmq_y = MMQ_Y_Q4_0_RDNA2;
+        nwarps = NWARPS_Q4_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q4_0_RDNA1;
+        mmq_y = MMQ_Y_Q4_0_RDNA1;
+        nwarps = NWARPS_Q4_0_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q4_0_AMPERE;
         mmq_y = MMQ_Y_Q4_0_AMPERE;
         nwarps = NWARPS_Q4_0_AMPERE;
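Every ggml_mul_mat_*_q8_1_cuda launcher in this file now picks its tile constants with the same if-chain over the encoded compute capability, covering RDNA2+, older AMD, Turing+/Ampere, and pre-Turing NVIDIA in that order. A compact sketch of the selection pattern (tile numbers here are placeholders, not the per-quantization constants):

// Compact sketch of the tile-size dispatch repeated in every mul_mat launcher above.
#include <cstdio>

static const int CC_TURING     = 700;
static const int CC_OFFSET_AMD = 1000000;
static const int CC_RDNA2      = CC_OFFSET_AMD + 1030;

static void pick_tiles(int compute_capability, int * mmq_x, int * mmq_y, int * nwarps) {
    if (compute_capability >= CC_RDNA2) {             // AMD RDNA2/RDNA3
        *mmq_x = 64;  *mmq_y = 128; *nwarps = 8;
    } else if (compute_capability >= CC_OFFSET_AMD) { // older AMD (e.g. RDNA1)
        *mmq_x = 64;  *mmq_y = 64;  *nwarps = 8;
    } else if (compute_capability >= CC_TURING) {     // NVIDIA Turing/Ampere
        *mmq_x = 64;  *mmq_y = 128; *nwarps = 4;
    } else {                                          // NVIDIA Pascal and older
        *mmq_x = 64;  *mmq_y = 64;  *nwarps = 8;
    }
}

int main() {
    int x, y, w;
    pick_tiles(CC_OFFSET_AMD + 1010, &x, &y, &w);     // e.g. gfx1010 falls into the RDNA1 branch
    printf("mmq_x=%d mmq_y=%d nwarps=%d\n", x, y, w);
    return 0;
}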
@@ -4595,7 +4905,15 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
|
|
4595
4905
|
const int compute_capability = g_compute_capabilities[id];
|
4596
4906
|
|
4597
4907
|
int mmq_x, mmq_y, nwarps;
|
4598
|
-
if (compute_capability >=
|
4908
|
+
if (compute_capability >= CC_RDNA2) {
|
4909
|
+
mmq_x = MMQ_X_Q4_1_RDNA2;
|
4910
|
+
mmq_y = MMQ_Y_Q4_1_RDNA2;
|
4911
|
+
nwarps = NWARPS_Q4_1_RDNA2;
|
4912
|
+
} else if (compute_capability >= CC_OFFSET_AMD) {
|
4913
|
+
mmq_x = MMQ_X_Q4_1_RDNA1;
|
4914
|
+
mmq_y = MMQ_Y_Q4_1_RDNA1;
|
4915
|
+
nwarps = NWARPS_Q4_1_RDNA1;
|
4916
|
+
} else if (compute_capability >= CC_TURING) {
|
4599
4917
|
mmq_x = MMQ_X_Q4_1_AMPERE;
|
4600
4918
|
mmq_y = MMQ_Y_Q4_1_AMPERE;
|
4601
4919
|
nwarps = NWARPS_Q4_1_AMPERE;
|
@@ -4632,7 +4950,15 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
|
|
4632
4950
|
const int compute_capability = g_compute_capabilities[id];
|
4633
4951
|
|
4634
4952
|
int mmq_x, mmq_y, nwarps;
|
4635
|
-
if (compute_capability >=
|
4953
|
+
if (compute_capability >= CC_RDNA2) {
|
4954
|
+
mmq_x = MMQ_X_Q5_0_RDNA2;
|
4955
|
+
mmq_y = MMQ_Y_Q5_0_RDNA2;
|
4956
|
+
nwarps = NWARPS_Q5_0_RDNA2;
|
4957
|
+
} else if (compute_capability >= CC_OFFSET_AMD) {
|
4958
|
+
mmq_x = MMQ_X_Q5_0_RDNA1;
|
4959
|
+
mmq_y = MMQ_Y_Q5_0_RDNA1;
|
4960
|
+
nwarps = NWARPS_Q5_0_RDNA1;
|
4961
|
+
} else if (compute_capability >= CC_TURING) {
|
4636
4962
|
mmq_x = MMQ_X_Q5_0_AMPERE;
|
4637
4963
|
mmq_y = MMQ_Y_Q5_0_AMPERE;
|
4638
4964
|
nwarps = NWARPS_Q5_0_AMPERE;
|
@@ -4669,7 +4995,15 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
|
|
4669
4995
|
const int compute_capability = g_compute_capabilities[id];
|
4670
4996
|
|
4671
4997
|
int mmq_x, mmq_y, nwarps;
|
4672
|
-
if (compute_capability >=
|
4998
|
+
if (compute_capability >= CC_RDNA2) {
|
4999
|
+
mmq_x = MMQ_X_Q5_1_RDNA2;
|
5000
|
+
mmq_y = MMQ_Y_Q5_1_RDNA2;
|
5001
|
+
nwarps = NWARPS_Q5_1_RDNA2;
|
5002
|
+
} else if (compute_capability >= CC_OFFSET_AMD) {
|
5003
|
+
mmq_x = MMQ_X_Q5_1_RDNA1;
|
5004
|
+
mmq_y = MMQ_Y_Q5_1_RDNA1;
|
5005
|
+
nwarps = NWARPS_Q5_1_RDNA1;
|
5006
|
+
} else if (compute_capability >= CC_TURING) {
|
4673
5007
|
mmq_x = MMQ_X_Q5_1_AMPERE;
|
4674
5008
|
mmq_y = MMQ_Y_Q5_1_AMPERE;
|
4675
5009
|
nwarps = NWARPS_Q5_1_AMPERE;
|
@@ -4706,7 +5040,15 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
|
|
4706
5040
|
const int compute_capability = g_compute_capabilities[id];
|
4707
5041
|
|
4708
5042
|
int mmq_x, mmq_y, nwarps;
|
4709
|
-
if (compute_capability >=
|
5043
|
+
if (compute_capability >= CC_RDNA2) {
|
5044
|
+
mmq_x = MMQ_X_Q8_0_RDNA2;
|
5045
|
+
mmq_y = MMQ_Y_Q8_0_RDNA2;
|
5046
|
+
nwarps = NWARPS_Q8_0_RDNA2;
|
5047
|
+
} else if (compute_capability >= CC_OFFSET_AMD) {
|
5048
|
+
mmq_x = MMQ_X_Q8_0_RDNA1;
|
5049
|
+
mmq_y = MMQ_Y_Q8_0_RDNA1;
|
5050
|
+
nwarps = NWARPS_Q8_0_RDNA1;
|
5051
|
+
} else if (compute_capability >= CC_TURING) {
|
4710
5052
|
mmq_x = MMQ_X_Q8_0_AMPERE;
|
4711
5053
|
mmq_y = MMQ_Y_Q8_0_AMPERE;
|
4712
5054
|
nwarps = NWARPS_Q8_0_AMPERE;
|
@@ -4743,7 +5085,15 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
|
|
4743
5085
|
const int compute_capability = g_compute_capabilities[id];
|
4744
5086
|
|
4745
5087
|
int mmq_x, mmq_y, nwarps;
|
4746
|
-
if (compute_capability >=
|
5088
|
+
if (compute_capability >= CC_RDNA2) {
|
5089
|
+
mmq_x = MMQ_X_Q2_K_RDNA2;
|
5090
|
+
mmq_y = MMQ_Y_Q2_K_RDNA2;
|
5091
|
+
nwarps = NWARPS_Q2_K_RDNA2;
|
5092
|
+
} else if (compute_capability >= CC_OFFSET_AMD) {
|
5093
|
+
mmq_x = MMQ_X_Q2_K_RDNA1;
|
5094
|
+
mmq_y = MMQ_Y_Q2_K_RDNA1;
|
5095
|
+
nwarps = NWARPS_Q2_K_RDNA1;
|
5096
|
+
} else if (compute_capability >= CC_TURING) {
|
4747
5097
|
mmq_x = MMQ_X_Q2_K_AMPERE;
|
4748
5098
|
mmq_y = MMQ_Y_Q2_K_AMPERE;
|
4749
5099
|
nwarps = NWARPS_Q2_K_AMPERE;
|
@@ -4782,7 +5132,15 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
|
|
4782
5132
|
const int compute_capability = g_compute_capabilities[id];
|
4783
5133
|
|
4784
5134
|
int mmq_x, mmq_y, nwarps;
|
4785
|
-
if (compute_capability >=
|
5135
|
+
if (compute_capability >= CC_RDNA2) {
|
5136
|
+
mmq_x = MMQ_X_Q3_K_RDNA2;
|
5137
|
+
mmq_y = MMQ_Y_Q3_K_RDNA2;
|
5138
|
+
nwarps = NWARPS_Q3_K_RDNA2;
|
5139
|
+
} else if (compute_capability >= CC_OFFSET_AMD) {
|
5140
|
+
mmq_x = MMQ_X_Q3_K_RDNA1;
|
5141
|
+
mmq_y = MMQ_Y_Q3_K_RDNA1;
|
5142
|
+
nwarps = NWARPS_Q3_K_RDNA1;
|
5143
|
+
} else if (compute_capability >= CC_TURING) {
|
4786
5144
|
mmq_x = MMQ_X_Q3_K_AMPERE;
|
4787
5145
|
mmq_y = MMQ_Y_Q3_K_AMPERE;
|
4788
5146
|
nwarps = NWARPS_Q3_K_AMPERE;
|
@@ -4820,7 +5178,15 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
|
|
4820
5178
|
const int compute_capability = g_compute_capabilities[id];
|
4821
5179
|
|
4822
5180
|
int mmq_x, mmq_y, nwarps;
|
4823
|
-
if (compute_capability >=
|
5181
|
+
if (compute_capability >= CC_RDNA2) {
|
5182
|
+
mmq_x = MMQ_X_Q4_K_RDNA2;
|
5183
|
+
mmq_y = MMQ_Y_Q4_K_RDNA2;
|
5184
|
+
nwarps = NWARPS_Q4_K_RDNA2;
|
5185
|
+
} else if (compute_capability >= CC_OFFSET_AMD) {
|
5186
|
+
mmq_x = MMQ_X_Q4_K_RDNA1;
|
5187
|
+
mmq_y = MMQ_Y_Q4_K_RDNA1;
|
5188
|
+
nwarps = NWARPS_Q4_K_RDNA1;
|
5189
|
+
} else if (compute_capability >= CC_TURING) {
|
4824
5190
|
mmq_x = MMQ_X_Q4_K_AMPERE;
|
4825
5191
|
mmq_y = MMQ_Y_Q4_K_AMPERE;
|
4826
5192
|
nwarps = NWARPS_Q4_K_AMPERE;
|
@@ -4857,7 +5223,15 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
|
|
4857
5223
|
const int compute_capability = g_compute_capabilities[id];
|
4858
5224
|
|
4859
5225
|
int mmq_x, mmq_y, nwarps;
|
4860
|
-
if (compute_capability >=
|
5226
|
+
if (compute_capability >= CC_RDNA2) {
|
5227
|
+
mmq_x = MMQ_X_Q5_K_RDNA2;
|
5228
|
+
mmq_y = MMQ_Y_Q5_K_RDNA2;
|
5229
|
+
nwarps = NWARPS_Q5_K_RDNA2;
|
5230
|
+
} else if (compute_capability >= CC_OFFSET_AMD) {
|
5231
|
+
mmq_x = MMQ_X_Q5_K_RDNA1;
|
5232
|
+
mmq_y = MMQ_Y_Q5_K_RDNA1;
|
5233
|
+
nwarps = NWARPS_Q5_K_RDNA1;
|
5234
|
+
} else if (compute_capability >= CC_TURING) {
|
4861
5235
|
mmq_x = MMQ_X_Q5_K_AMPERE;
|
4862
5236
|
mmq_y = MMQ_Y_Q5_K_AMPERE;
|
4863
5237
|
nwarps = NWARPS_Q5_K_AMPERE;
|
@@ -4894,7 +5268,15 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
|
|
4894
5268
|
const int compute_capability = g_compute_capabilities[id];
|
4895
5269
|
|
4896
5270
|
int mmq_x, mmq_y, nwarps;
|
4897
|
-
if (compute_capability >=
|
5271
|
+
if (compute_capability >= CC_RDNA2) {
|
5272
|
+
mmq_x = MMQ_X_Q6_K_RDNA2;
|
5273
|
+
mmq_y = MMQ_Y_Q6_K_RDNA2;
|
5274
|
+
nwarps = NWARPS_Q6_K_RDNA2;
|
5275
|
+
} else if (compute_capability >= CC_OFFSET_AMD) {
|
5276
|
+
mmq_x = MMQ_X_Q6_K_RDNA1;
|
5277
|
+
mmq_y = MMQ_Y_Q6_K_RDNA1;
|
5278
|
+
nwarps = NWARPS_Q6_K_RDNA1;
|
5279
|
+
} else if (compute_capability >= CC_TURING) {
|
4898
5280
|
mmq_x = MMQ_X_Q6_K_AMPERE;
|
4899
5281
|
mmq_y = MMQ_Y_Q6_K_AMPERE;
|
4900
5282
|
nwarps = NWARPS_Q6_K_AMPERE;
|
@@ -4984,12 +5366,13 @@ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, co
     rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }
 
-static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float
-
-
-    const
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+    const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
+    GGML_ASSERT(ncols % 4 == 0);
+    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
+    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols,
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
 }
 
 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5127,25 +5510,30 @@ void ggml_init_cublas() {
     GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
     int64_t total_vram = 0;
     fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-    for (
+    for (int64_t id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, " Device %
+        fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
 
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
-
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
+#else
         g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     }
-    for (
+    for (int64_t id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
     }
 
-    for (
-        CUDA_CHECK(
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
 
-        // create
-
+        // create cuda streams
+        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
+        }
 
         // create cublas handle
         CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
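In ggml_init_cublas each g_tensor_split[id] ends up as the cumulative fraction of total VRAM held by devices 0..id-1, which later decides how many rows of a split tensor each GPU receives. A tiny sketch of that prefix-sum-then-normalize step (the VRAM sizes below are hypothetical):

// Sketch of the VRAM-proportional split computed during initialization.
#include <cstdio>

int main() {
    const double vram[3] = {24.0, 12.0, 12.0};   // hypothetical VRAM in GiB
    double split[3] = {0.0};

    double total = 0.0;
    for (int id = 0; id < 3; ++id) {
        split[id] = total;    // prefix sum *before* adding this device
        total    += vram[id];
    }
    for (int id = 0; id < 3; ++id) {
        split[id] /= total;   // 0.00, 0.50, 0.75 -> device boundaries as fractions
        printf("device %d starts at fraction %.2f\n", id, split[id]);
    }
    return 0;
}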
@@ -5214,7 +5602,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     if (src->backend == GGML_BACKEND_CPU) {
         kind = cudaMemcpyHostToDevice;
         src_ptr = (char *) src->data;
-    } else if (src->backend == GGML_BACKEND_GPU) {
+    } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
+        GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = cudaMemcpyDeviceToDevice;
         struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
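The copy helper now also accepts row-split sources, but only when the requested row range covers the whole tensor, since partial ranges of a split tensor live on different devices. A hedged sketch of that guard (names and types are illustrative, not the ggml API):

// Sketch of the row-range guard added to ggml_cuda_cpy_tensor_2d.
#include <cstdio>

enum backend_kind { BACKEND_CPU, BACKEND_GPU, BACKEND_GPU_SPLIT };

static bool can_copy_rows(backend_kind backend, long i1_low, long i1_high, long ne1) {
    if (backend == BACKEND_GPU_SPLIT) {
        // only a full-range copy of a split tensor is well defined here
        return i1_low == 0 && i1_high == ne1;
    }
    return true;
}

int main() {
    printf("%d\n", can_copy_rows(BACKEND_GPU_SPLIT, 0, 32, 32)); // 1: whole tensor
    printf("%d\n", can_copy_rows(BACKEND_GPU_SPLIT, 8, 16, 32)); // 0: partial range rejected
    return 0;
}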
@@ -5253,236 +5642,205 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
|
|
5253
5642
|
}
|
5254
5643
|
|
5255
5644
|
inline void ggml_cuda_op_add(
|
5256
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5257
|
-
float *
|
5258
|
-
cudaStream_t & cudaStream_main){
|
5645
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5646
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5259
5647
|
|
5260
|
-
GGML_ASSERT(
|
5261
|
-
GGML_ASSERT(src1_ddf_i != nullptr);
|
5262
|
-
GGML_ASSERT(dst_ddf_i != nullptr);
|
5263
|
-
|
5264
|
-
const int64_t ne00 = src0->ne[0];
|
5265
|
-
const int64_t i01_diff = i01_high - i01_low;
|
5648
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
5266
5649
|
|
5267
5650
|
const int64_t ne10 = src1->ne[0];
|
5268
5651
|
const int64_t ne11 = src1->ne[1];
|
5269
5652
|
|
5270
|
-
// compute
|
5271
5653
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
5272
|
-
add_f32_cuda(
|
5654
|
+
add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
|
5273
5655
|
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
5274
|
-
add_f16_f32_f16_cuda((half *)
|
5656
|
+
add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
|
5275
5657
|
} else {
|
5276
5658
|
GGML_ASSERT(false);
|
5277
5659
|
}
|
5278
5660
|
|
5279
5661
|
(void) src1;
|
5280
5662
|
(void) dst;
|
5281
|
-
(void) src0_ddq_i;
|
5282
|
-
(void) i02;
|
5283
|
-
(void) i1;
|
5284
5663
|
}
|
5285
5664
|
|
5286
5665
|
inline void ggml_cuda_op_mul(
|
5287
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5288
|
-
float *
|
5289
|
-
cudaStream_t & cudaStream_main){
|
5290
|
-
|
5291
|
-
GGML_ASSERT(src0_ddf_i != nullptr);
|
5292
|
-
GGML_ASSERT(src1_ddf_i != nullptr);
|
5293
|
-
GGML_ASSERT(dst_ddf_i != nullptr);
|
5666
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5667
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5294
5668
|
|
5295
|
-
|
5296
|
-
|
5669
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
5670
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
5671
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5297
5672
|
|
5298
5673
|
const int64_t ne10 = src1->ne[0];
|
5299
5674
|
const int64_t ne11 = src1->ne[1];
|
5300
5675
|
|
5301
|
-
mul_f32_cuda(
|
5676
|
+
mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
|
5302
5677
|
|
5303
5678
|
(void) dst;
|
5304
|
-
(void) src0_ddq_i;
|
5305
|
-
(void) i02;
|
5306
|
-
(void) i1;
|
5307
5679
|
}
|
5308
5680
|
|
5309
5681
|
inline void ggml_cuda_op_gelu(
|
5310
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5311
|
-
float *
|
5312
|
-
cudaStream_t & cudaStream_main){
|
5682
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5683
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5313
5684
|
|
5314
|
-
GGML_ASSERT(
|
5315
|
-
GGML_ASSERT(
|
5685
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
5686
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5316
5687
|
|
5317
|
-
|
5318
|
-
const int64_t i01_diff = i01_high - i01_low;
|
5319
|
-
|
5320
|
-
// compute
|
5321
|
-
gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
|
5688
|
+
gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
|
5322
5689
|
|
5323
5690
|
(void) src1;
|
5324
5691
|
(void) dst;
|
5325
|
-
(void)
|
5326
|
-
(void) src1_ddf_i;
|
5327
|
-
(void) i02;
|
5328
|
-
(void) i1;
|
5692
|
+
(void) src1_dd;
|
5329
5693
|
}
|
5330
5694
|
|
5331
5695
|
inline void ggml_cuda_op_silu(
|
5332
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5333
|
-
float *
|
5334
|
-
cudaStream_t & cudaStream_main){
|
5696
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5697
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5335
5698
|
|
5336
|
-
GGML_ASSERT(
|
5337
|
-
GGML_ASSERT(
|
5699
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
5700
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5338
5701
|
|
5339
|
-
|
5340
|
-
const int64_t i01_diff = i01_high - i01_low;
|
5341
|
-
|
5342
|
-
// compute
|
5343
|
-
silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
|
5702
|
+
silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
|
5344
5703
|
|
5345
5704
|
(void) src1;
|
5346
5705
|
(void) dst;
|
5347
|
-
(void)
|
5348
|
-
(void) src1_ddf_i;
|
5349
|
-
(void) i02;
|
5350
|
-
(void) i1;
|
5706
|
+
(void) src1_dd;
|
5351
5707
|
}
|
5352
5708
|
|
5353
5709
|
inline void ggml_cuda_op_norm(
|
5354
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5355
|
-
float *
|
5356
|
-
cudaStream_t & cudaStream_main){
|
5710
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5711
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5357
5712
|
|
5358
|
-
GGML_ASSERT(
|
5359
|
-
GGML_ASSERT(
|
5713
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
5714
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5360
5715
|
|
5361
5716
|
const int64_t ne00 = src0->ne[0];
|
5362
|
-
const int64_t
|
5717
|
+
const int64_t nrows = ggml_nrows(src0);
|
5363
5718
|
|
5364
|
-
|
5365
|
-
norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
|
5719
|
+
norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
|
5366
5720
|
|
5367
5721
|
(void) src1;
|
5368
5722
|
(void) dst;
|
5369
|
-
(void)
|
5370
|
-
(void) src1_ddf_i;
|
5371
|
-
(void) i02;
|
5372
|
-
(void) i1;
|
5723
|
+
(void) src1_dd;
|
5373
5724
|
}
|
5374
5725
|
|
5375
5726
|
inline void ggml_cuda_op_rms_norm(
|
5376
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5377
|
-
float *
|
5378
|
-
cudaStream_t & cudaStream_main){
|
5727
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5728
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5379
5729
|
|
5380
|
-
GGML_ASSERT(
|
5381
|
-
GGML_ASSERT(
|
5730
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
5731
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5382
5732
|
|
5383
5733
|
const int64_t ne00 = src0->ne[0];
|
5384
|
-
const int64_t
|
5734
|
+
const int64_t nrows = ggml_nrows(src0);
|
5385
5735
|
|
5386
5736
|
float eps;
|
5387
5737
|
memcpy(&eps, dst->op_params, sizeof(float));
|
5388
5738
|
|
5389
|
-
|
5390
|
-
rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
|
5739
|
+
rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
|
5391
5740
|
|
5392
5741
|
(void) src1;
|
5393
5742
|
(void) dst;
|
5394
|
-
(void)
|
5395
|
-
(void) src1_ddf_i;
|
5396
|
-
(void) i02;
|
5397
|
-
(void) i1;
|
5743
|
+
(void) src1_dd;
|
5398
5744
|
}
|
5399
5745
|
|
5400
5746
|
inline void ggml_cuda_op_mul_mat_q(
|
5401
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
|
5402
|
-
|
5403
|
-
cudaStream_t &
|
5404
|
-
|
5405
|
-
GGML_ASSERT(src0_ddq_i != nullptr);
|
5406
|
-
GGML_ASSERT(src1_ddf_i != nullptr);
|
5407
|
-
GGML_ASSERT(dst_ddf_i != nullptr);
|
5747
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
5748
|
+
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
5749
|
+
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
5408
5750
|
|
5409
5751
|
const int64_t ne00 = src0->ne[0];
|
5410
5752
|
|
5411
5753
|
const int64_t ne10 = src1->ne[0];
|
5412
|
-
const int64_t ne11 = src1->ne[1];
|
5413
5754
|
GGML_ASSERT(ne10 % QK8_1 == 0);
|
5414
5755
|
|
5415
5756
|
const int64_t ne0 = dst->ne[0];
|
5416
5757
|
|
5417
|
-
const int64_t
|
5758
|
+
const int64_t row_diff = row_high - row_low;
|
5418
5759
|
|
5419
5760
|
int id;
|
5420
5761
|
CUDA_CHECK(cudaGetDevice(&id));
|
5421
5762
|
|
5422
5763
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
5423
5764
|
// nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
|
5424
|
-
const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 :
|
5425
|
-
|
5426
|
-
const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
|
5427
|
-
ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
|
5428
|
-
size_t as;
|
5429
|
-
void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
|
5430
|
-
quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
|
5765
|
+
const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
|
5431
5766
|
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            ggml_mul_mat_q4_0_q8_1_cuda(
+            ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q4_1:
-            ggml_mul_mat_q4_1_q8_1_cuda(
+            ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q5_0:
-            ggml_mul_mat_q5_0_q8_1_cuda(
+            ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q5_1:
-            ggml_mul_mat_q5_1_q8_1_cuda(
+            ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q8_0:
-            ggml_mul_mat_q8_0_q8_1_cuda(
+            ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q2_K:
-            ggml_mul_mat_q2_K_q8_1_cuda(
+            ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q3_K:
-            ggml_mul_mat_q3_K_q8_1_cuda(
+            ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q4_K:
-            ggml_mul_mat_q4_K_q8_1_cuda(
+            ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q5_K:
-            ggml_mul_mat_q5_K_q8_1_cuda(
+            ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q6_K:
-            ggml_mul_mat_q6_K_q8_1_cuda(
+            ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         default:
             GGML_ASSERT(false);
             break;
     }
 
-    ggml_cuda_pool_free(src1_q8_1, as);
-
     (void) src1;
     (void) dst;
-    (void)
-    (void) i02;
-    (void) i1;
+    (void) src1_ddf_i;
 }
 
 static int64_t get_row_rounding(ggml_type type) {
-
-
-
-
-
+    int64_t min_compute_capability = INT_MAX;
+    int64_t max_compute_capability = INT_MIN;
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            if (min_compute_capability > g_compute_capabilities[id]) {
+                min_compute_capability = g_compute_capabilities[id];
+            }
+            if (max_compute_capability < g_compute_capabilities[id]) {
+                max_compute_capability = g_compute_capabilities[id];
+            }
         }
     }
 
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    switch(type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
+        case GGML_TYPE_F16:
+            return 1;
+        case GGML_TYPE_Q2_K:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
+        case GGML_TYPE_Q3_K:
+            return min_compute_capability < CC_RDNA2 ? 128 : 64;
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
+        default:
+            GGML_ASSERT(false);
+    }
+#else
     switch(type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
@@ -5503,170 +5861,147 @@ static int64_t get_row_rounding(ggml_type type) {
         default:
             GGML_ASSERT(false);
     }
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 }
 
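Note on the hunk above: get_row_rounding() now returns the row-count multiple that the per-GPU tensor split is aligned to (larger tiles when an RDNA2-class device is present). Below is a minimal standalone sketch of that alignment, not code from the diff; tensor_split, n_rows and rounding are hypothetical stand-ins for the globals used in ggml-cuda.cu (g_tensor_split, ne01, get_row_rounding()).

    // Sketch: round the per-device row boundaries down to a multiple of `rounding`,
    // the same way ggml_cuda_op_mul_mat aligns row_low/row_high to mul_mat_q tile sizes.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        const int64_t n_rows   = 4096;                       // rows of the split weight matrix (assumed)
        const int64_t rounding = 64;                         // e.g. a value get_row_rounding() could return (assumed)
        const std::vector<float> tensor_split = {0.0f, 0.4f}; // fraction of rows starting on each device (assumed)
        const int n_devices = (int) tensor_split.size();

        for (int id = 0; id < n_devices; ++id) {
            int64_t row_low = id == 0 ? 0 : (int64_t)(n_rows*tensor_split[id]);
            row_low -= row_low % rounding;                   // round down to the tile boundary
            int64_t row_high = id == n_devices - 1 ? n_rows : (int64_t)(n_rows*tensor_split[id + 1]);
            if (id != n_devices - 1) {
                row_high -= row_high % rounding;
            }
            printf("device %d: rows [%lld, %lld)\n", id, (long long) row_low, (long long) row_high);
        }
        return 0;
    }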
5508
|
-
inline void
|
5509
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
|
5510
|
-
|
5511
|
-
cudaStream_t &
|
5512
|
-
|
5513
|
-
GGML_ASSERT(src0_ddq_i != nullptr);
|
5514
|
-
GGML_ASSERT(src1_ddf_i != nullptr);
|
5515
|
-
GGML_ASSERT(dst_ddf_i != nullptr);
|
5867
|
+
inline void ggml_cuda_op_mul_mat_vec_q(
|
5868
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
5869
|
+
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
5870
|
+
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
5516
5871
|
|
5517
5872
|
const int64_t ne00 = src0->ne[0];
|
5518
|
-
const int64_t
|
5873
|
+
const int64_t row_diff = row_high - row_low;
|
5519
5874
|
|
5520
|
-
|
5521
|
-
|
5522
|
-
|
5523
|
-
|
5524
|
-
|
5525
|
-
|
5875
|
+
switch (src0->type) {
|
5876
|
+
case GGML_TYPE_Q4_0:
|
5877
|
+
mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5878
|
+
break;
|
5879
|
+
case GGML_TYPE_Q4_1:
|
5880
|
+
mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5881
|
+
break;
|
5882
|
+
case GGML_TYPE_Q5_0:
|
5883
|
+
mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5884
|
+
break;
|
5885
|
+
case GGML_TYPE_Q5_1:
|
5886
|
+
mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5887
|
+
break;
|
5888
|
+
case GGML_TYPE_Q8_0:
|
5889
|
+
mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5890
|
+
break;
|
5891
|
+
case GGML_TYPE_Q2_K:
|
5892
|
+
mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5893
|
+
break;
|
5894
|
+
case GGML_TYPE_Q3_K:
|
5895
|
+
mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5896
|
+
break;
|
5897
|
+
case GGML_TYPE_Q4_K:
|
5898
|
+
mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5899
|
+
break;
|
5900
|
+
case GGML_TYPE_Q5_K:
|
5901
|
+
mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5902
|
+
break;
|
5903
|
+
case GGML_TYPE_Q6_K:
|
5904
|
+
mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5905
|
+
break;
|
5906
|
+
default:
|
5907
|
+
GGML_ASSERT(false);
|
5908
|
+
break;
|
5909
|
+
}
|
5526
5910
|
|
5527
|
-
|
5528
|
-
|
5529
|
-
|
5530
|
-
|
5531
|
-
|
5532
|
-
|
5533
|
-
#if QK_K == 256
|
5534
|
-
mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
|
5535
|
-
src0->type == GGML_TYPE_Q2_K ||
|
5536
|
-
src0->type == GGML_TYPE_Q3_K ||
|
5537
|
-
src0->type == GGML_TYPE_Q4_K ||
|
5538
|
-
src0->type == GGML_TYPE_Q5_K ||
|
5539
|
-
src0->type == GGML_TYPE_Q6_K;
|
5540
|
-
#endif // QK_K == 256
|
5541
|
-
|
5542
|
-
const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
|
5543
|
-
#endif
|
5911
|
+
(void) src1;
|
5912
|
+
(void) dst;
|
5913
|
+
(void) src1_ddf_i;
|
5914
|
+
(void) src1_ncols;
|
5915
|
+
(void) src1_padded_row_size;
|
5916
|
+
}
|
5544
5917
|
|
5545
|
-
|
5546
|
-
|
5547
|
-
|
5548
|
-
|
5549
|
-
void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
|
5550
|
-
quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
|
5551
|
-
|
5552
|
-
switch (src0->type) {
|
5553
|
-
case GGML_TYPE_Q4_0:
|
5554
|
-
mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5555
|
-
break;
|
5556
|
-
case GGML_TYPE_Q4_1:
|
5557
|
-
mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5558
|
-
break;
|
5559
|
-
case GGML_TYPE_Q5_0:
|
5560
|
-
mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5561
|
-
break;
|
5562
|
-
case GGML_TYPE_Q5_1:
|
5563
|
-
mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5564
|
-
break;
|
5565
|
-
case GGML_TYPE_Q8_0:
|
5566
|
-
mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5567
|
-
break;
|
5568
|
-
case GGML_TYPE_Q2_K:
|
5569
|
-
mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5570
|
-
break;
|
5571
|
-
case GGML_TYPE_Q3_K:
|
5572
|
-
mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5573
|
-
break;
|
5574
|
-
case GGML_TYPE_Q4_K:
|
5575
|
-
mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5576
|
-
break;
|
5577
|
-
case GGML_TYPE_Q5_K:
|
5578
|
-
mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5579
|
-
break;
|
5580
|
-
case GGML_TYPE_Q6_K:
|
5581
|
-
mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5582
|
-
break;
|
5583
|
-
default:
|
5584
|
-
GGML_ASSERT(false);
|
5585
|
-
break;
|
5586
|
-
}
|
5918
|
+
inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
5919
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
5920
|
+
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
5921
|
+
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
5587
5922
|
|
5588
|
-
|
5589
|
-
|
5590
|
-
|
5923
|
+
const int64_t ne00 = src0->ne[0];
|
5924
|
+
const int64_t row_diff = row_high - row_low;
|
5925
|
+
|
5926
|
+
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
5591
5927
|
#ifdef GGML_CUDA_F16
|
5592
|
-
|
5593
|
-
|
5594
|
-
|
5595
|
-
|
5596
|
-
|
5597
|
-
|
5598
|
-
|
5599
|
-
|
5600
|
-
|
5601
|
-
|
5602
|
-
|
5603
|
-
|
5604
|
-
|
5928
|
+
size_t ash;
|
5929
|
+
dfloat * src1_dfloat = nullptr; // dfloat == half
|
5930
|
+
|
5931
|
+
bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
5932
|
+
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
|
5933
|
+
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
5934
|
+
|
5935
|
+
if (src1_convert_f16) {
|
5936
|
+
src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
|
5937
|
+
ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
|
5938
|
+
ne00, 1, sizeof(float), 0, 0,
|
5939
|
+
ne00, 1, sizeof(half), 0, 0, stream);
|
5940
|
+
}
|
5605
5941
|
#else
|
5606
|
-
|
5942
|
+
const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
|
5607
5943
|
#endif // GGML_CUDA_F16
|
5608
5944
|
|
5609
|
-
|
5610
|
-
|
5611
|
-
|
5612
|
-
|
5613
|
-
|
5614
|
-
|
5615
|
-
|
5616
|
-
|
5617
|
-
|
5618
|
-
|
5619
|
-
|
5620
|
-
|
5621
|
-
|
5622
|
-
|
5623
|
-
|
5624
|
-
|
5625
|
-
|
5626
|
-
|
5627
|
-
|
5628
|
-
|
5629
|
-
|
5630
|
-
|
5631
|
-
|
5632
|
-
|
5633
|
-
|
5634
|
-
|
5635
|
-
|
5636
|
-
|
5637
|
-
|
5638
|
-
|
5639
|
-
|
5640
|
-
|
5641
|
-
|
5642
|
-
|
5643
|
-
|
5644
|
-
|
5645
|
-
|
5646
|
-
|
5945
|
+
switch (src0->type) {
|
5946
|
+
case GGML_TYPE_Q4_0:
|
5947
|
+
dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5948
|
+
break;
|
5949
|
+
case GGML_TYPE_Q4_1:
|
5950
|
+
dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5951
|
+
break;
|
5952
|
+
case GGML_TYPE_Q5_0:
|
5953
|
+
dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5954
|
+
break;
|
5955
|
+
case GGML_TYPE_Q5_1:
|
5956
|
+
dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5957
|
+
break;
|
5958
|
+
case GGML_TYPE_Q8_0:
|
5959
|
+
dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5960
|
+
break;
|
5961
|
+
case GGML_TYPE_Q2_K:
|
5962
|
+
dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5963
|
+
break;
|
5964
|
+
case GGML_TYPE_Q3_K:
|
5965
|
+
dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5966
|
+
break;
|
5967
|
+
case GGML_TYPE_Q4_K:
|
5968
|
+
dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5969
|
+
break;
|
5970
|
+
case GGML_TYPE_Q5_K:
|
5971
|
+
dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5972
|
+
break;
|
5973
|
+
case GGML_TYPE_Q6_K:
|
5974
|
+
dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5975
|
+
break;
|
5976
|
+
case GGML_TYPE_F16:
|
5977
|
+
convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5978
|
+
break;
|
5979
|
+
default:
|
5980
|
+
GGML_ASSERT(false);
|
5981
|
+
break;
|
5982
|
+
}
|
5647
5983
|
|
5648
5984
|
#ifdef GGML_CUDA_F16
|
5649
|
-
|
5650
|
-
|
5651
|
-
}
|
5652
|
-
#endif // GGML_CUDA_F16
|
5985
|
+
if (src1_convert_f16) {
|
5986
|
+
ggml_cuda_pool_free(src1_dfloat, ash);
|
5653
5987
|
}
|
5988
|
+
#endif // GGML_CUDA_F16
|
5654
5989
|
|
5655
5990
|
(void) src1;
|
5656
5991
|
(void) dst;
|
5657
|
-
(void)
|
5658
|
-
(void)
|
5659
|
-
(void)
|
5992
|
+
(void) src1_ddq_i;
|
5993
|
+
(void) src1_ncols;
|
5994
|
+
(void) src1_padded_row_size;
|
5660
5995
|
}
|
5661
5996
|
|
5662
5997
|
inline void ggml_cuda_op_mul_mat_cublas(
|
5663
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
|
5664
|
-
|
5665
|
-
cudaStream_t &
|
5998
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
5999
|
+
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
6000
|
+
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
5666
6001
|
|
5667
|
-
GGML_ASSERT(
|
6002
|
+
GGML_ASSERT(src0_dd_i != nullptr);
|
5668
6003
|
GGML_ASSERT(src1_ddf_i != nullptr);
|
5669
|
-
GGML_ASSERT(
|
6004
|
+
GGML_ASSERT(dst_dd_i != nullptr);
|
5670
6005
|
|
5671
6006
|
const float alpha = 1.0f;
|
5672
6007
|
const float beta = 0.0f;
|
@@ -5674,43 +6009,54 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|
5674
6009
|
const int64_t ne00 = src0->ne[0];
|
5675
6010
|
|
5676
6011
|
const int64_t ne10 = src1->ne[0];
|
5677
|
-
const int64_t ne11 = src1->ne[1];
|
5678
6012
|
|
5679
6013
|
const int64_t ne0 = dst->ne[0];
|
5680
|
-
const int64_t
|
6014
|
+
const int64_t row_diff = row_high - row_low;
|
6015
|
+
|
6016
|
+
float * src0_ddq_as_f32;
|
6017
|
+
size_t src0_as = 0;
|
6018
|
+
|
6019
|
+
if (src0->type != GGML_TYPE_F32) {
|
6020
|
+
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
6021
|
+
src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
|
6022
|
+
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
|
6023
|
+
}
|
6024
|
+
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
|
5681
6025
|
|
5682
6026
|
int id;
|
5683
6027
|
CUDA_CHECK(cudaGetDevice(&id));
|
5684
6028
|
|
5685
6029
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
5686
6030
|
// ldc == nrows of the matrix that cuBLAS writes into
|
5687
|
-
int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 :
|
6031
|
+
int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
|
5688
6032
|
|
5689
|
-
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id],
|
6033
|
+
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
|
5690
6034
|
CUBLAS_CHECK(
|
5691
6035
|
cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
5692
|
-
|
6036
|
+
row_diff, src1_ncols, ne10,
|
5693
6037
|
&alpha, src0_ddf_i, ne00,
|
5694
|
-
src1_ddf_i,
|
5695
|
-
&beta,
|
6038
|
+
src1_ddf_i, ne10,
|
6039
|
+
&beta, dst_dd_i, ldc));
|
6040
|
+
|
6041
|
+
if (src0_as > 0) {
|
6042
|
+
ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
|
6043
|
+
}
|
5696
6044
|
|
5697
6045
|
(void) dst;
|
5698
|
-
(void)
|
5699
|
-
(void)
|
5700
|
-
(void) i1;
|
6046
|
+
(void) src1_ddq_i;
|
6047
|
+
(void) src1_padded_row_size;
|
5701
6048
|
}
|
5702
6049
|
|
5703
6050
|
inline void ggml_cuda_op_rope(
|
5704
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5705
|
-
float *
|
5706
|
-
cudaStream_t & cudaStream_main){
|
6051
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6052
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5707
6053
|
|
5708
|
-
GGML_ASSERT(
|
5709
|
-
GGML_ASSERT(
|
6054
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6055
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5710
6056
|
|
5711
6057
|
const int64_t ne00 = src0->ne[0];
|
5712
6058
|
const int64_t ne01 = src0->ne[1];
|
5713
|
-
const int64_t
|
6059
|
+
const int64_t nrows = ggml_nrows(src0);
|
5714
6060
|
|
5715
6061
|
const int n_past = ((int32_t *) dst->op_params)[0];
|
5716
6062
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
@@ -5723,44 +6069,37 @@ inline void ggml_cuda_op_rope(
|
|
5723
6069
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
5724
6070
|
|
5725
6071
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
6072
|
+
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
5726
6073
|
|
5727
6074
|
const bool is_neox = mode & 2;
|
5728
6075
|
const bool is_glm = mode & 4;
|
5729
6076
|
|
5730
6077
|
// compute
|
5731
6078
|
if (is_glm) {
|
5732
|
-
|
5733
|
-
const float id_p = min(p, n_ctx - 2.f);
|
5734
|
-
const float block_p = max(p - (n_ctx - 2.f), 0.f);
|
5735
|
-
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
|
6079
|
+
rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream);
|
5736
6080
|
} else if (is_neox) {
|
5737
6081
|
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
|
5738
|
-
|
5739
|
-
rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
6082
|
+
rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
|
5740
6083
|
} else {
|
5741
|
-
|
5742
|
-
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
6084
|
+
rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
|
5743
6085
|
}
|
5744
6086
|
|
5745
6087
|
(void) src1;
|
5746
6088
|
(void) dst;
|
5747
|
-
(void)
|
5748
|
-
(void) src1_ddf_i;
|
5749
|
-
(void) i1;
|
6089
|
+
(void) src1_dd;
|
5750
6090
|
}
|
5751
6091
|
|
5752
6092
|
inline void ggml_cuda_op_alibi(
|
5753
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5754
|
-
float *
|
5755
|
-
cudaStream_t & cudaStream_main){
|
6093
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6094
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5756
6095
|
|
5757
|
-
GGML_ASSERT(
|
5758
|
-
GGML_ASSERT(
|
6096
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6097
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5759
6098
|
|
5760
6099
|
const int64_t ne00 = src0->ne[0];
|
5761
6100
|
const int64_t ne01 = src0->ne[1];
|
5762
6101
|
const int64_t ne02 = src0->ne[2];
|
5763
|
-
const int64_t
|
6102
|
+
const int64_t nrows = ggml_nrows(src0);
|
5764
6103
|
|
5765
6104
|
const int n_past = ((int32_t *) dst->op_params)[0];
|
5766
6105
|
const int n_head = ((int32_t *) dst->op_params)[1];
|
@@ -5775,334 +6114,354 @@ inline void ggml_cuda_op_alibi(
|
|
5775
6114
|
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
5776
6115
|
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
5777
6116
|
|
5778
|
-
|
5779
|
-
alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
|
6117
|
+
alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
|
5780
6118
|
|
5781
6119
|
(void) src1;
|
5782
|
-
(void)
|
5783
|
-
(void) src1_ddf_i;
|
5784
|
-
(void) i1;
|
6120
|
+
(void) src1_dd;
|
5785
6121
|
}
|
5786
6122
|
|
5787
6123
|
inline void ggml_cuda_op_diag_mask_inf(
|
5788
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5789
|
-
float *
|
5790
|
-
cudaStream_t & cudaStream_main){
|
6124
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6125
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5791
6126
|
|
5792
|
-
GGML_ASSERT(
|
5793
|
-
GGML_ASSERT(
|
6127
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6128
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5794
6129
|
|
5795
6130
|
const int64_t ne00 = src0->ne[0];
|
5796
6131
|
const int64_t ne01 = src0->ne[1];
|
5797
|
-
const
|
6132
|
+
const int nrows0 = ggml_nrows(src0);
|
5798
6133
|
|
5799
6134
|
const int n_past = ((int32_t *) dst->op_params)[0];
|
5800
6135
|
|
5801
|
-
|
5802
|
-
diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
|
6136
|
+
diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
|
5803
6137
|
|
5804
6138
|
(void) src1;
|
5805
6139
|
(void) dst;
|
5806
|
-
(void)
|
5807
|
-
(void) src1_ddf_i;
|
5808
|
-
(void) i02;
|
5809
|
-
(void) i1;
|
6140
|
+
(void) src1_dd;
|
5810
6141
|
}
|
5811
6142
|
|
5812
6143
|
inline void ggml_cuda_op_soft_max(
|
5813
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5814
|
-
float *
|
5815
|
-
cudaStream_t & cudaStream_main){
|
6144
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6145
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5816
6146
|
|
5817
|
-
GGML_ASSERT(
|
5818
|
-
GGML_ASSERT(
|
6147
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6148
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5819
6149
|
|
5820
6150
|
const int64_t ne00 = src0->ne[0];
|
5821
|
-
const int64_t
|
6151
|
+
const int64_t nrows = ggml_nrows(src0);
|
5822
6152
|
|
5823
|
-
|
5824
|
-
soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
|
6153
|
+
soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
|
5825
6154
|
|
5826
6155
|
(void) src1;
|
5827
6156
|
(void) dst;
|
5828
|
-
(void)
|
5829
|
-
(void) src1_ddf_i;
|
5830
|
-
(void) i02;
|
5831
|
-
(void) i1;
|
6157
|
+
(void) src1_dd;
|
5832
6158
|
}
|
5833
6159
|
|
5834
6160
|
inline void ggml_cuda_op_scale(
|
5835
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5836
|
-
float *
|
5837
|
-
cudaStream_t & cudaStream_main){
|
6161
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6162
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5838
6163
|
|
5839
|
-
GGML_ASSERT(
|
5840
|
-
GGML_ASSERT(
|
6164
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6165
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
6166
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5841
6167
|
|
5842
6168
|
const float scale = ((float *) src1->data)[0];
|
5843
6169
|
|
5844
|
-
|
5845
|
-
const int64_t i01_diff = i01_high - i01_low;
|
5846
|
-
|
5847
|
-
// compute
|
5848
|
-
scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
|
6170
|
+
scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
|
5849
6171
|
CUDA_CHECK(cudaGetLastError());
|
5850
6172
|
|
5851
6173
|
(void) src1;
|
5852
6174
|
(void) dst;
|
5853
|
-
(void)
|
5854
|
-
|
5855
|
-
|
5856
|
-
|
6175
|
+
(void) src1_dd;
|
6176
|
+
}
|
6177
|
+
|
6178
|
+
+static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
+    const int64_t nrows0 = ggml_nrows(src0);
+
+    const bool use_src1 = src1 != nullptr;
+    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
+
+    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+
+    const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
+    const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
+
+    const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
+
+    // dd = data device
+    float * src0_ddf = nullptr;
+    float * src1_ddf = nullptr;
+    float * dst_ddf = nullptr;
+
+    // as = actual size
+    size_t src0_asf = 0;
+    size_t src1_asf = 0;
+    size_t dst_asf = 0;
+
+    ggml_cuda_set_device(g_main_device);
+    const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    if (src0_on_device) {
+        src0_ddf = (float *) src0_extra->data_device[g_main_device];
+    } else {
+        src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
+        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
+    }
+
+    if (use_src1 && !src1_stays_on_host) {
+        if (src1_on_device) {
+            src1_ddf = (float *) src1_extra->data_device[g_main_device];
+        } else {
+            src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
+            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
+        }
+    }
+    if (dst_on_device) {
+        dst_ddf = (float *) dst_extra->data_device[g_main_device];
+    } else {
+        dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
+    }
+
+    // do the computation
+    op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
+    CUDA_CHECK(cudaGetLastError());
+
+    // copy dst to host if necessary
+    if (!dst_on_device) {
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
+    }
+
+    if (src0_asf > 0) {
+        ggml_cuda_pool_free(src0_ddf, src0_asf);
+    }
+    if (src1_asf > 0) {
+        ggml_cuda_pool_free(src1_ddf, src1_asf);
+    }
+    if (dst_asf > 0) {
+        ggml_cuda_pool_free(dst_ddf, dst_asf);
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
 }
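The new ggml_cuda_op_flatten() above factors buffer allocation, host/device copies and synchronization out of the individual ops: each wrapper later in this diff simply passes a callback, e.g. ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add). Below is a minimal sketch of that callback pattern with simplified stand-in types (tensor, flatten_op_t, run_flattened are illustrative names, not the real ggml structs or functions).

    // Sketch: a wrapper that owns the "plumbing" and delegates the actual work to a function pointer.
    #include <cstdio>

    struct tensor { const char * name; };
    typedef void (*flatten_op_t)(const tensor * src0, const tensor * src1, tensor * dst);

    static void op_add(const tensor * src0, const tensor * src1, tensor * dst) {
        printf("add: %s + %s -> %s\n", src0->name, src1 ? src1->name : "(none)", dst->name);
    }

    static void run_flattened(const tensor * src0, const tensor * src1, tensor * dst, flatten_op_t op) {
        // real code: allocate/copy device buffers, pick the main stream, then call the op ...
        op(src0, src1, dst);
        // ... then copy dst back to the host if needed, free pool buffers, synchronize
    }

    int main() {
        tensor a = {"a"}, b = {"b"}, c = {"c"};
        run_flattened(&a, &b, &c, op_add); // mirrors ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add)
        return 0;
    }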
|
5858
6254
|
|
5859
|
-
static void
|
5860
|
-
|
6255
|
+
static void ggml_cuda_op_mul_mat(
|
6256
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
|
6257
|
+
const bool convert_src1_to_q8_1) {
|
6258
|
+
|
5861
6259
|
const int64_t ne00 = src0->ne[0];
|
5862
6260
|
const int64_t ne01 = src0->ne[1];
|
5863
6261
|
const int64_t ne02 = src0->ne[2];
|
5864
6262
|
const int64_t ne03 = src0->ne[3];
|
5865
6263
|
const int64_t nrows0 = ggml_nrows(src0);
|
5866
6264
|
|
5867
|
-
const
|
5868
|
-
const int64_t
|
5869
|
-
const int64_t
|
5870
|
-
const int64_t
|
5871
|
-
const int64_t
|
5872
|
-
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
|
6265
|
+
const int64_t ne10 = src1->ne[0];
|
6266
|
+
const int64_t ne11 = src1->ne[1];
|
6267
|
+
const int64_t ne12 = src1->ne[2];
|
6268
|
+
const int64_t ne13 = src1->ne[3];
|
6269
|
+
const int64_t nrows1 = ggml_nrows(src1);
|
5873
6270
|
|
5874
6271
|
GGML_ASSERT(ne03 == ne13);
|
5875
6272
|
|
5876
6273
|
const int64_t ne0 = dst->ne[0];
|
5877
6274
|
const int64_t ne1 = dst->ne[1];
|
5878
6275
|
|
5879
|
-
const int nb2
|
5880
|
-
const int nb3
|
6276
|
+
const int nb2 = dst->nb[2];
|
6277
|
+
const int nb3 = dst->nb[3];
|
5881
6278
|
|
5882
6279
|
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
|
5883
|
-
GGML_ASSERT(
|
6280
|
+
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
|
5884
6281
|
|
5885
|
-
|
5886
|
-
const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
|
5887
|
-
const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
|
5888
|
-
const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
|
5889
|
-
const int64_t src0_stride = ne00 * ne01 * stride_mod;
|
5890
|
-
const int64_t src1_stride = ne10 * ne11 * stride_mod;
|
5891
|
-
const int64_t dst_stride = ne0 * ne1 * stride_mod;
|
6282
|
+
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
|
5892
6283
|
|
5893
|
-
const int64_t
|
5894
|
-
const int64_t i03_max = flatten_rows ? 1 : ne03;
|
5895
|
-
const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
|
5896
|
-
const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
|
5897
|
-
GGML_ASSERT(!(flatten_rows && ne02 < ne12));
|
6284
|
+
const int64_t i02_divisor = ne12 / ne02;
|
5898
6285
|
|
5899
6286
|
const size_t src0_ts = ggml_type_size(src0->type);
|
5900
6287
|
const size_t src0_bs = ggml_blck_size(src0->type);
|
6288
|
+
const size_t q8_1_ts = sizeof(block_q8_1);
|
6289
|
+
const size_t q8_1_bs = QK8_1;
|
5901
6290
|
|
5902
|
-
struct ggml_tensor_extra_gpu * src0_extra =
|
5903
|
-
struct ggml_tensor_extra_gpu * src1_extra =
|
5904
|
-
struct ggml_tensor_extra_gpu *
|
6291
|
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6292
|
+
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
6293
|
+
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
5905
6294
|
|
5906
6295
|
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
5907
6296
|
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
5908
|
-
const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
|
5909
6297
|
|
5910
|
-
const bool src1_is_contiguous =
|
5911
|
-
const
|
5912
|
-
|
6298
|
+
const bool src1_is_contiguous = ggml_is_contiguous(src1);
|
6299
|
+
const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ?
|
6300
|
+
ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
|
5913
6301
|
|
5914
6302
|
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
6303
|
+
GGML_ASSERT(!(split && ne02 > 1));
|
6304
|
+
GGML_ASSERT(!(split && ne03 > 1));
|
5915
6305
|
GGML_ASSERT(!(split && ne02 < ne12));
|
5916
6306
|
|
5917
|
-
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
5918
|
-
|
5919
6307
|
// dd = data device
|
5920
|
-
char *
|
5921
|
-
float *
|
5922
|
-
|
5923
|
-
float *
|
5924
|
-
|
5925
|
-
//
|
5926
|
-
size_t
|
5927
|
-
size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
6308
|
+
char * src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
6309
|
+
float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
|
6310
|
+
char * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
|
6311
|
+
float * dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
6312
|
+
|
6313
|
+
// as = actual size
|
6314
|
+
size_t src0_as[GGML_CUDA_MAX_DEVICES] = {0};
|
5928
6315
|
size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
5929
|
-
size_t
|
6316
|
+
size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
|
6317
|
+
size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0};
|
5930
6318
|
|
5931
|
-
|
5932
|
-
|
5933
|
-
if (split && g_device_count > 1) {
|
5934
|
-
CUDA_CHECK(cudaSetDevice(g_main_device));
|
5935
|
-
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
|
5936
|
-
}
|
6319
|
+
int64_t row_low[GGML_CUDA_MAX_DEVICES];
|
6320
|
+
int64_t row_high[GGML_CUDA_MAX_DEVICES];
|
5937
6321
|
|
5938
|
-
for (
|
5939
|
-
|
5940
|
-
|
5941
|
-
|
6322
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6323
|
+
// by default, use all rows
|
6324
|
+
row_low[id] = 0;
|
6325
|
+
row_high[id] = ne01;
|
5942
6326
|
|
5943
|
-
|
5944
|
-
|
5945
|
-
|
5946
|
-
int64_t row_low, row_high;
|
6327
|
+
// for multi GPU, get the row boundaries from tensor split
|
6328
|
+
// and round to mul_mat_q tile sizes
|
5947
6329
|
if (split) {
|
5948
6330
|
const int64_t rounding = get_row_rounding(src0->type);
|
5949
6331
|
|
5950
|
-
|
5951
|
-
|
6332
|
+
if (id != 0) {
|
6333
|
+
row_low[id] = ne01*g_tensor_split[id];
|
6334
|
+
row_low[id] -= row_low[id] % rounding;
|
6335
|
+
}
|
5952
6336
|
|
5953
|
-
if (id
|
5954
|
-
row_high
|
5955
|
-
|
5956
|
-
row_high = nrows0*g_tensor_split[id + 1];
|
5957
|
-
row_high -= row_high % rounding;
|
6337
|
+
if (id != g_device_count - 1) {
|
6338
|
+
row_high[id] = ne01*g_tensor_split[id + 1];
|
6339
|
+
row_high[id] -= row_high[id] % rounding;
|
5958
6340
|
}
|
5959
|
-
} else {
|
5960
|
-
row_low = 0;
|
5961
|
-
row_high = nrows0*i02_divisor;
|
5962
6341
|
}
|
5963
|
-
|
6342
|
+
}
|
6343
|
+
|
6344
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6345
|
+
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
5964
6346
|
continue;
|
5965
6347
|
}
|
5966
6348
|
|
5967
|
-
|
5968
|
-
|
5969
|
-
cudaSetDevice(id);
|
5970
|
-
cudaStream_t cudaStream_main = g_cudaStreams_main[id];
|
6349
|
+
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
6350
|
+
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
5971
6351
|
|
5972
|
-
|
5973
|
-
|
5974
|
-
CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
|
5975
|
-
}
|
6352
|
+
ggml_cuda_set_device(id);
|
6353
|
+
const cudaStream_t stream = g_cudaStreams[id][0];
|
5976
6354
|
|
5977
6355
|
if (src0_on_device && src0_is_contiguous) {
|
5978
|
-
|
5979
|
-
src0_ddf[id] = (float *) src0_extra->data_device[id];
|
5980
|
-
} else {
|
5981
|
-
src0_ddq[id] = (char *) src0_extra->data_device[id];
|
5982
|
-
}
|
6356
|
+
src0_dd[id] = (char *) src0_extra->data_device[id];
|
5983
6357
|
} else {
|
5984
|
-
|
5985
|
-
|
5986
|
-
} else {
|
5987
|
-
src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
|
5988
|
-
}
|
6358
|
+
const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
|
6359
|
+
src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
|
5989
6360
|
}
|
5990
6361
|
|
5991
|
-
if (
|
5992
|
-
|
6362
|
+
if (src1_on_device && src1_is_contiguous) {
|
6363
|
+
src1_ddf[id] = (float *) src1_extra->data_device[id];
|
6364
|
+
} else {
|
6365
|
+
src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
|
5993
6366
|
}
|
5994
6367
|
|
5995
|
-
if (
|
5996
|
-
|
5997
|
-
|
5998
|
-
|
5999
|
-
src1_ddf[id]
|
6368
|
+
if (convert_src1_to_q8_1) {
|
6369
|
+
src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
|
6370
|
+
|
6371
|
+
if (split && src1_on_device && src1_is_contiguous) {
|
6372
|
+
quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
|
6373
|
+
CUDA_CHECK(cudaGetLastError());
|
6000
6374
|
}
|
6001
6375
|
}
|
6376
|
+
|
6002
6377
|
if (dst_on_device) {
|
6003
|
-
|
6378
|
+
dst_dd[id] = (float *) dst_extra->data_device[id];
|
6004
6379
|
} else {
|
6005
|
-
size_t size_dst_ddf = split ?
|
6006
|
-
|
6380
|
+
const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
|
6381
|
+
dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
|
6007
6382
|
}
|
6383
|
+
}
|
6008
6384
|
|
6009
|
-
|
6010
|
-
|
6011
|
-
|
6012
|
-
|
6385
|
+
// if multiple devices are used they need to wait for the main device
|
6386
|
+
// here an event is recorded that signals that the main device has finished calculating the input data
|
6387
|
+
if (split && g_device_count > 1) {
|
6388
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6389
|
+
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
|
6390
|
+
}
|
6013
6391
|
|
6014
|
-
|
6392
|
+
const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
|
6393
|
+
for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
|
6394
|
+
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
|
6395
|
+
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
|
6015
6396
|
|
6016
|
-
|
6017
|
-
|
6018
|
-
|
6397
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6398
|
+
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
6399
|
+
continue;
|
6400
|
+
}
|
6019
6401
|
|
6020
|
-
|
6021
|
-
|
6022
|
-
|
6023
|
-
if (i0 < i0_offset_low || i0 > i0_offset_high) {
|
6024
|
-
continue;
|
6025
|
-
}
|
6026
|
-
if (i0 == i0_offset_low) {
|
6027
|
-
i01_low = row_low % rows_per_iter;
|
6028
|
-
}
|
6029
|
-
if (i0 == i0_offset_high) {
|
6030
|
-
i01_high = row_high % rows_per_iter;
|
6031
|
-
}
|
6032
|
-
}
|
6402
|
+
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
6403
|
+
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
6404
|
+
const int64_t row_diff = row_high[id] - row_low[id];
|
6033
6405
|
|
6034
|
-
|
6035
|
-
|
6036
|
-
// Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
|
6037
|
-
// The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
|
6038
|
-
GGML_ASSERT(i01_low == 0 || g_device_count > 1);
|
6039
|
-
GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
|
6406
|
+
ggml_cuda_set_device(id);
|
6407
|
+
const cudaStream_t stream = g_cudaStreams[id][is];
|
6040
6408
|
|
6041
|
-
|
6042
|
-
|
6043
|
-
|
6044
|
-
|
6045
|
-
|
6409
|
+
// wait for main GPU data if necessary
|
6410
|
+
if (split && (id != g_main_device || is != 0)) {
|
6411
|
+
CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
|
6412
|
+
}
|
6413
|
+
|
6414
|
+
for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
|
6415
|
+
const int64_t i03 = i0 / ne12;
|
6416
|
+
const int64_t i02 = i0 % ne12;
|
6417
|
+
|
6418
|
+
const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
6046
6419
|
|
6047
6420
|
// for split tensors the data begins at i0 == i0_offset_low
|
6048
|
-
char *
|
6049
|
-
float *
|
6050
|
-
|
6051
|
-
float *
|
6052
|
-
|
6053
|
-
// for split tensors the data pointer needs to be rounded down
|
6054
|
-
// to the bin edge for i03, i02 bins beyond the first
|
6055
|
-
if (i0 - i0_offset_low > 0) {
|
6056
|
-
GGML_ASSERT(!flatten_rows);
|
6057
|
-
src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
|
6058
|
-
src0_ddf_i -= (row_low % ne01)*ne00;
|
6059
|
-
dst_ddf_i -= (row_low % ne0)*ne1;
|
6060
|
-
}
|
6421
|
+
char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
|
6422
|
+
float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
|
6423
|
+
char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
|
6424
|
+
float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
|
6061
6425
|
|
6062
6426
|
// the main device memory buffer can be on VRAM scratch, with space for all partial results
|
6063
6427
|
// in that case an offset on dst_ddf_i is needed
|
6064
6428
|
if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
|
6065
|
-
|
6429
|
+
dst_dd_i += row_low[id]; // offset is 0 if no tensor split
|
6066
6430
|
}
|
6067
6431
|
|
6068
6432
|
// copy src0, src1 to device if necessary
|
6069
|
-
if (
|
6070
|
-
if (
|
6071
|
-
|
6072
|
-
|
6073
|
-
|
6074
|
-
|
6075
|
-
|
6076
|
-
GGML_ASSERT(!flatten_rows);
|
6433
|
+
if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
|
6434
|
+
if (id != g_main_device) {
|
6435
|
+
if (convert_src1_to_q8_1) {
|
6436
|
+
char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
|
6437
|
+
CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs,
|
6438
|
+
cudaMemcpyDeviceToDevice, stream));
|
6439
|
+
} else {
|
6077
6440
|
float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
|
6078
|
-
src1_ddf_i_source +=
|
6079
|
-
CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source,
|
6080
|
-
cudaMemcpyDeviceToDevice,
|
6441
|
+
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
|
6442
|
+
CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float),
|
6443
|
+
cudaMemcpyDeviceToDevice, stream));
|
6081
6444
|
}
|
6082
|
-
} else if (src1_on_device && !src1_is_contiguous) {
|
6083
|
-
GGML_ASSERT(!split);
|
6084
|
-
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
|
6085
|
-
} else {
|
6086
|
-
GGML_ASSERT(false);
|
6087
6445
|
}
|
6446
|
+
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
|
6447
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
|
6448
|
+
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
6449
|
+
} else {
|
6450
|
+
GGML_ASSERT(false);
|
6088
6451
|
}
|
6089
6452
|
|
6090
|
-
if (
|
6091
|
-
|
6092
|
-
|
6093
|
-
} else {
|
6094
|
-
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
|
6095
|
-
}
|
6453
|
+
if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
|
6454
|
+
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
6455
|
+
CUDA_CHECK(cudaGetLastError());
|
6096
6456
|
}
|
6097
6457
|
|
6098
|
-
|
6099
|
-
|
6100
|
-
to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
|
6101
|
-
CUDA_CHECK(cudaGetLastError());
|
6458
|
+
if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
|
6459
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
|
6102
6460
|
}
|
6103
6461
|
|
6104
6462
|
// do the computation
|
6105
|
-
op(src0, src1, dst,
|
6463
|
+
op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
|
6464
|
+
row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
|
6106
6465
|
CUDA_CHECK(cudaGetLastError());
|
6107
6466
|
|
6108
6467
|
// copy dst to host or other device if necessary
|
@@ -6124,95 +6483,86 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
6124
6483
|
// The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
|
6125
6484
|
// Instead they need to be copied to the correct slice in ne0 = dst row index.
|
6126
6485
|
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
|
6127
|
-
float * dhf_dst_i = (float *) ((char *) dst_off_device +
|
6128
|
-
|
6129
|
-
|
6486
|
+
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
6487
|
+
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
6488
|
+
dhf_dst_i += src1_col_0*ne0 + row_low[id];
|
6489
|
+
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float),
|
6490
|
+
row_diff*sizeof(float), src1_ncols, kind, stream));
|
6130
6491
|
} else {
|
6131
6492
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
6132
|
-
|
6493
|
+
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
6494
|
+
dhf_dst_i += src1_col_0*ne0;
|
6495
|
+
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
|
6133
6496
|
}
|
6134
6497
|
}
|
6135
6498
|
|
6136
|
-
//
|
6137
|
-
if (split &&
|
6138
|
-
CUDA_CHECK(cudaEventRecord(src0_extra->events[id],
|
6499
|
+
// add event for the main device to wait on until other device is done
|
6500
|
+
if (split && (id != g_main_device || is != 0)) {
|
6501
|
+
CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
|
6139
6502
|
}
|
6140
6503
|
}
|
6141
6504
|
}
|
6142
6505
|
}
|
6143
6506
|
|
6144
|
-
|
6145
|
-
|
6146
|
-
if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
|
6147
|
-
continue;
|
6148
|
-
}
|
6149
|
-
|
6150
|
-
CUDA_CHECK(cudaSetDevice(id));
|
6507
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6508
|
+
CUDA_CHECK(ggml_cuda_set_device(id));
|
6151
6509
|
|
6152
|
-
|
6153
|
-
|
6154
|
-
|
6155
|
-
if (src0_asf[id] > 0) {
|
6156
|
-
ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
|
6510
|
+
// free buffers again when done
|
6511
|
+
if (src0_as[id] > 0) {
|
6512
|
+
ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
|
6157
6513
|
}
|
6158
6514
|
if (src1_asf[id] > 0) {
|
6159
6515
|
ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
|
6160
6516
|
}
|
6161
|
-
if (
|
6162
|
-
ggml_cuda_pool_free(
|
6517
|
+
if (src1_asq[id] > 0) {
|
6518
|
+
ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
|
6519
|
+
}
|
6520
|
+
if (dst_as[id] > 0) {
|
6521
|
+
ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
|
6163
6522
|
}
|
6164
6523
|
}
|
6165
6524
|
|
6166
6525
|
// main device waits for all other devices to be finished
|
6167
6526
|
if (split && g_device_count > 1) {
|
6168
|
-
|
6169
|
-
|
6170
|
-
|
6171
|
-
|
6527
|
+
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
|
6528
|
+
is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
|
6529
|
+
|
6530
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6531
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6532
|
+
for (int64_t is = 0; is < is_max; ++is) {
|
6533
|
+
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
|
6172
6534
|
}
|
6173
6535
|
}
|
6174
6536
|
}
|
6175
6537
|
|
6176
6538
|
if (dst->backend == GGML_BACKEND_CPU) {
|
6177
|
-
CUDA_CHECK(
|
6539
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6178
6540
|
CUDA_CHECK(cudaDeviceSynchronize());
|
6179
6541
|
}
|
6180
6542
|
}
|
6181
6543
|
|
6182
6544
|
 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    // Due to flatten_rows == true this does in practice not make a difference however.
-    // Better solution would be nice but right now that would require disproportionate changes.
-    GGML_ASSERT(
-        (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
-        src1->type == GGML_TYPE_F32 &&
-        (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }
 
 void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
 }
 
 void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
 }
 
 void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }
 
 void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }
 
 void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
@@ -6246,8 +6596,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne12 = src1->ne[2];
 
-    CUDA_CHECK(
-    cudaStream_t
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -6258,7 +6608,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
-    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12,
+    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 }
 
 void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -6277,8 +6627,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int64_t nb01 = src0->nb[1];
     const int64_t nb02 = src0->nb[2];
 
-    CUDA_CHECK(
-    cudaStream_t
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -6289,38 +6639,49 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
-    const
-    const
+    const int64_t row_stride_x = nb01 / sizeof(half);
+    const int64_t channel_stride_x = nb02 / sizeof(half);
 
-    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x,
+    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
         src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
 
+    int64_t min_compute_capability = INT_MAX;
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (min_compute_capability > g_compute_capabilities[id]
+                && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            min_compute_capability = g_compute_capabilities[id];
+        }
+    }
+
     if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
     } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
     }else if (src0->type == GGML_TYPE_F32) {
-
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
-        } else {
-            int min_compute_capability = INT_MAX;
-            for (int id = 0; id < g_device_count; ++id) {
-                if (min_compute_capability > g_compute_capabilities[id]
-                    && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-                    min_compute_capability = g_compute_capabilities[id];
-                }
-            }
 
+#ifdef GGML_CUDA_FORCE_DMMV
+            const bool use_mul_mat_vec_q = false;
+#else
+            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+#endif // GGML_CUDA_FORCE_DMMV
+
+            if (use_mul_mat_vec_q) {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+            } else {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+            }
+        } else {
             if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
-
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
             } else {
-
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
            }
        }
    } else {
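In the dispatch above, passing true as the last argument of ggml_cuda_op_mul_mat() asks it to quantize src1 to q8_1 on the device; the buffer for that copy is sized from a padded row length. Below is a minimal sketch of that size arithmetic only; the values of MATRIX_ROW_PADDING and QK8_1 and the block_q8_1 layout are assumptions for the example, not taken from this diff.

    // Sketch: how many bytes the q8_1 copy of src1 needs once each row is padded.
    #include <cstdint>
    #include <cstdio>

    struct block_q8_1 { uint16_t d; uint16_t s; int8_t qs[32]; }; // assumed layout, 32 quants per block
    static const int64_t QK8_1 = 32;                              // assumed quants per block
    static const int64_t MATRIX_ROW_PADDING = 512;                // assumed padding granularity

    int main() {
        const int64_t ne10   = 4100; // src1 row length (example)
        const int64_t nrows1 = 8;    // number of src1 rows (example)

        // pad each row up to a multiple of MATRIX_ROW_PADDING, as ggml_cuda_op_mul_mat does
        const int64_t padded = ne10 % MATRIX_ROW_PADDING == 0 ?
            ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;

        // one block_q8_1 per QK8_1 values -> bytes for the quantized copy of src1
        const int64_t bytes = nrows1*padded*sizeof(block_q8_1)/QK8_1;
        printf("padded row: %lld values, q8_1 buffer: %lld bytes\n", (long long) padded, (long long) bytes);
        return 0;
    }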
@@ -6329,8 +6690,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
 }
 
 void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }
 
 void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6359,8 +6719,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     const int64_t nb11 = src1->nb[1];
     const int64_t nb12 = src1->nb[2];
 
-    CUDA_CHECK(
-    cudaStream_t
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -6370,10 +6730,10 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 
     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12,
+                              ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12,
+                              ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         GGML_ASSERT(false);
     }
@@ -6387,28 +6747,20 @@ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 }
 
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
 }
 
 void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
 }
 
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
-
-    const int mode = ((int32_t *) dst->op_params)[2];
-    const bool is_glm = mode & 4;
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
 }
 
 void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }
 
 void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6418,7 +6770,7 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 }
 
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
-
+    const int64_t nrows = ggml_nrows(tensor);
 
     const int64_t ne0 = tensor->ne[0];
 
@@ -6428,14 +6780,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
 
-    for (
+    for (int64_t id = 0; id < g_device_count; ++id) {
         if (backend == GGML_BACKEND_GPU && id != g_main_device) {
             continue;
         }
 
-
+        ggml_cuda_set_device(id);
 
-
+        int64_t row_low, row_high;
         if (backend == GGML_BACKEND_GPU) {
             row_low = 0;
             row_high = nrows;
@@ -6485,7 +6837,9 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         extra->data_device[id] = buf;
 
         if (backend == GGML_BACKEND_GPU_SPLIT) {
-
+            for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+                CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
+            }
         }
     }
 
@@ -6499,15 +6853,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
-    for (
+    for (int64_t id = 0; id < g_device_count; ++id) {
         if (extra->data_device[id] != nullptr) {
-            CUDA_CHECK(
+            CUDA_CHECK(ggml_cuda_set_device(id));
             CUDA_CHECK(cudaFree(extra->data_device[id]));
         }
 
-
-
-
+        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+            if (extra->events[id][is] != nullptr) {
+                CUDA_CHECK(ggml_cuda_set_device(id));
+                CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
+            }
         }
     }
 
@@ -6559,7 +6915,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
-    CUDA_CHECK(
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
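The events[id][is] arrays created and destroyed above back the cross-device synchronization in ggml_cuda_op_mul_mat(): the main device records an event on its stream and the other devices' streams wait on it before consuming the data. Below is a minimal CUDA host-side sketch of that record/wait pattern, using a single device and stream purely for illustration; it is not code from this diff.

    // Sketch: record an event on one stream, make a (possibly different) stream wait on it.
    #include <cuda_runtime.h>
    #include <cstdio>

    #define CHECK(err) do { cudaError_t e_ = (err); if (e_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(e_)); return 1; } } while (0)

    int main() {
        int device_count = 0;
        CHECK(cudaGetDeviceCount(&device_count));
        if (device_count < 1) { return 0; }

        const int main_device = 0;
        cudaStream_t main_stream;
        cudaEvent_t  done;
        CHECK(cudaSetDevice(main_device));
        CHECK(cudaStreamCreate(&main_stream));
        // events are created with timing disabled, as in ggml_cuda_transform_tensor above
        CHECK(cudaEventCreateWithFlags(&done, cudaEventDisableTiming));

        // main device: record that its work (e.g. quantizing src1) has been enqueued ...
        CHECK(cudaEventRecord(done, main_stream));
        // ... consumers: wait on that event before their dependent work runs
        CHECK(cudaStreamWaitEvent(main_stream, done, 0));

        CHECK(cudaStreamSynchronize(main_stream));
        CHECK(cudaEventDestroy(done));
        CHECK(cudaStreamDestroy(main_stream));
        return 0;
    }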
|