llama_cpp 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -3
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +0 -5
- data/ext/llama_cpp/src/ggml-cuda.cu +1011 -655
- data/ext/llama_cpp/src/ggml-metal.m +57 -15
- data/ext/llama_cpp/src/ggml-metal.metal +271 -137
- data/ext/llama_cpp/src/ggml.c +7 -3
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +4 -1
- data/ext/llama_cpp/src/llama.cpp +617 -141
- data/ext/llama_cpp/src/llama.h +8 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cpp → data/ext/llama_cpp/src/ggml-cuda.cu changes:

@@ -13,7 +13,7 @@
 #ifdef __HIP_PLATFORM_AMD__
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
-#endif
+#endif // __HIP_PLATFORM_AMD__
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
@@ -68,19 +68,29 @@
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
-#endif
+#endif // defined(GGML_USE_HIPBLAS)
 
 #include "ggml-cuda.h"
 #include "ggml.h"
 
-#define MIN_CC_DP4A
-#
-#define
-#
+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define CC_TURING 700
+#define CC_OFFSET_AMD 1000000
+#define CC_RDNA2 CC_OFFSET_AMD + 1030
 
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+    defined(__gfx1150__) || defined(__gfx1151__)
+#define RDNA3
+#endif
+
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+#define RDNA2
+#endif
+
 #ifndef __has_builtin
 #define __has_builtin(x) 0
 #endif
@@ -132,7 +142,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 #endif
     return c;
 }
-#endif
+#endif // defined(GGML_USE_HIPBLAS)
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -144,8 +154,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do { \
         cudaError_t err_ = (err); \
         if (err_ != cudaSuccess) { \
-
+            int id; \
+            cudaGetDevice(&id); \
+            fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
                 cudaGetErrorString(err_)); \
+            fprintf(stderr, "current device: %d\n", id); \
             exit(1); \
         } \
     } while (0)
@@ -155,8 +168,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do { \
         cublasStatus_t err_ = (err); \
         if (err_ != CUBLAS_STATUS_SUCCESS) { \
+            int id; \
+            cudaGetDevice(&id); \
             fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
                     err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
+            fprintf(stderr, "current device: %d\n", id); \
             exit(1); \
         } \
     } while (0)
@@ -165,7 +181,10 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do { \
         cublasStatus_t err_ = (err); \
         if (err_ != CUBLAS_STATUS_SUCCESS) { \
+            int id; \
+            cudaGetDevice(&id); \
             fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
+            fprintf(stderr, "current device: %d\n", id); \
             exit(1); \
         } \
     } while (0)
@@ -212,10 +231,13 @@ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__
 typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
-typedef void (*
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
-
-    cudaStream_t &
+typedef void (*ggml_cuda_op_mul_mat_t)(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, const cudaStream_t & stream);
+typedef void (*ggml_cuda_op_flatten_t)(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream);
 
 // QK = number of values after dequantization
 // QR = QK / number of values before dequantization
@@ -396,11 +418,29 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif
 
+#define MUL_MAT_SRC1_COL_STRIDE 128
+
+#define MAX_STREAMS 8
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+
 struct ggml_tensor_extra_gpu {
     void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
 };
 
+// this is faster on Windows
+// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
+inline cudaError_t ggml_cuda_set_device(const int device) {
+    int current_device;
+    CUDA_CHECK(cudaGetDevice(&current_device));
+
+    if (device == current_device) {
+        return cudaSuccess;
+    }
+
+    return cudaSetDevice(device);
+}
+
 static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
@@ -413,8 +453,6 @@ static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -3444,6 +3482,12 @@ static __device__ __forceinline__ void mul_mat_q(
     }
 }
 
+#define MMQ_X_Q4_0_RDNA2 64
+#define MMQ_Y_Q4_0_RDNA2 128
+#define NWARPS_Q4_0_RDNA2 8
+#define MMQ_X_Q4_0_RDNA1 64
+#define MMQ_Y_Q4_0_RDNA1 64
+#define NWARPS_Q4_0_RDNA1 8
 #define MMQ_X_Q4_0_AMPERE 64
 #define MMQ_Y_Q4_0_AMPERE 128
 #define NWARPS_Q4_0_AMPERE 4
@@ -3451,11 +3495,32 @@ static __device__ __forceinline__ void mul_mat_q(
 #define MMQ_Y_Q4_0_PASCAL 64
 #define NWARPS_Q4_0_PASCAL 8
 
-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q4_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
 
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q4_0_RDNA2;
+    const int mmq_y = MMQ_Y_Q4_0_RDNA2;
+    const int nwarps = NWARPS_Q4_0_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q4_0_RDNA1;
+    const int mmq_y = MMQ_Y_Q4_0_RDNA1;
+    const int nwarps = NWARPS_Q4_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q4_0_AMPERE;
     const int mmq_y = MMQ_Y_Q4_0_AMPERE;
     const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3478,6 +3543,12 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }
 
+#define MMQ_X_Q4_1_RDNA2 64
+#define MMQ_Y_Q4_1_RDNA2 128
+#define NWARPS_Q4_1_RDNA2 8
+#define MMQ_X_Q4_1_RDNA1 64
+#define MMQ_Y_Q4_1_RDNA1 64
+#define NWARPS_Q4_1_RDNA1 8
 #define MMQ_X_Q4_1_AMPERE 64
 #define MMQ_Y_Q4_1_AMPERE 128
 #define NWARPS_Q4_1_AMPERE 4
@@ -3486,14 +3557,33 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
 #define NWARPS_Q4_1_PASCAL 8
 
 template <bool need_check> static __global__ void
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
     mul_mat_q4_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
 
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q4_1_RDNA2;
+    const int mmq_y = MMQ_Y_Q4_1_RDNA2;
+    const int nwarps = NWARPS_Q4_1_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q4_1_RDNA1;
+    const int mmq_y = MMQ_Y_Q4_1_RDNA1;
+    const int nwarps = NWARPS_Q4_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q4_1_AMPERE;
     const int mmq_y = MMQ_Y_Q4_1_AMPERE;
     const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3516,6 +3606,12 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_TURING
 }
 
+#define MMQ_X_Q5_0_RDNA2 64
+#define MMQ_Y_Q5_0_RDNA2 128
+#define NWARPS_Q5_0_RDNA2 8
+#define MMQ_X_Q5_0_RDNA1 64
+#define MMQ_Y_Q5_0_RDNA1 64
+#define NWARPS_Q5_0_RDNA1 8
 #define MMQ_X_Q5_0_AMPERE 128
 #define MMQ_Y_Q5_0_AMPERE 64
 #define NWARPS_Q5_0_AMPERE 4
@@ -3523,11 +3619,32 @@ template <bool need_check> static __global__ void
 #define MMQ_Y_Q5_0_PASCAL 64
 #define NWARPS_Q5_0_PASCAL 8
 
-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
 
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q5_0_RDNA2;
+    const int mmq_y = MMQ_Y_Q5_0_RDNA2;
+    const int nwarps = NWARPS_Q5_0_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q5_0_RDNA1;
+    const int mmq_y = MMQ_Y_Q5_0_RDNA1;
+    const int nwarps = NWARPS_Q5_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q5_0_AMPERE;
     const int mmq_y = MMQ_Y_Q5_0_AMPERE;
     const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3550,6 +3667,12 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }
 
+#define MMQ_X_Q5_1_RDNA2 64
+#define MMQ_Y_Q5_1_RDNA2 128
+#define NWARPS_Q5_1_RDNA2 8
+#define MMQ_X_Q5_1_RDNA1 64
+#define MMQ_Y_Q5_1_RDNA1 64
+#define NWARPS_Q5_1_RDNA1 8
 #define MMQ_X_Q5_1_AMPERE 128
 #define MMQ_Y_Q5_1_AMPERE 64
 #define NWARPS_Q5_1_AMPERE 4
@@ -3557,11 +3680,32 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
 #define MMQ_Y_Q5_1_PASCAL 64
 #define NWARPS_Q5_1_PASCAL 8
 
-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
 
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q5_1_RDNA2;
+    const int mmq_y = MMQ_Y_Q5_1_RDNA2;
+    const int nwarps = NWARPS_Q5_1_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q5_1_RDNA1;
+    const int mmq_y = MMQ_Y_Q5_1_RDNA1;
+    const int nwarps = NWARPS_Q5_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q5_1_AMPERE;
     const int mmq_y = MMQ_Y_Q5_1_AMPERE;
     const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3584,6 +3728,12 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }
 
+#define MMQ_X_Q8_0_RDNA2 64
+#define MMQ_Y_Q8_0_RDNA2 128
+#define NWARPS_Q8_0_RDNA2 8
+#define MMQ_X_Q8_0_RDNA1 64
+#define MMQ_Y_Q8_0_RDNA1 64
+#define NWARPS_Q8_0_RDNA1 8
 #define MMQ_X_Q8_0_AMPERE 128
 #define MMQ_Y_Q8_0_AMPERE 64
 #define NWARPS_Q8_0_AMPERE 4
@@ -3591,11 +3741,32 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
 #define MMQ_Y_Q8_0_PASCAL 64
 #define NWARPS_Q8_0_PASCAL 8
 
-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q8_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
 
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q8_0_RDNA2;
+    const int mmq_y = MMQ_Y_Q8_0_RDNA2;
+    const int nwarps = NWARPS_Q8_0_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q8_0_RDNA1;
+    const int mmq_y = MMQ_Y_Q8_0_RDNA1;
+    const int nwarps = NWARPS_Q8_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q8_0_AMPERE;
     const int mmq_y = MMQ_Y_Q8_0_AMPERE;
     const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3618,6 +3789,12 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }
 
+#define MMQ_X_Q2_K_RDNA2 64
+#define MMQ_Y_Q2_K_RDNA2 128
+#define NWARPS_Q2_K_RDNA2 8
+#define MMQ_X_Q2_K_RDNA1 128
+#define MMQ_Y_Q2_K_RDNA1 32
+#define NWARPS_Q2_K_RDNA1 8
 #define MMQ_X_Q2_K_AMPERE 64
 #define MMQ_Y_Q2_K_AMPERE 128
 #define NWARPS_Q2_K_AMPERE 4
@@ -3625,11 +3802,32 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
 #define MMQ_Y_Q2_K_PASCAL 64
 #define NWARPS_Q2_K_PASCAL 8
 
-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q2_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
 
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q2_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q2_K_RDNA2;
+    const int nwarps = NWARPS_Q2_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q2_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q2_K_RDNA1;
+    const int nwarps = NWARPS_Q2_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q2_K_AMPERE;
     const int mmq_y = MMQ_Y_Q2_K_AMPERE;
    const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3652,6 +3850,12 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }
 
+#define MMQ_X_Q3_K_RDNA2 128
+#define MMQ_Y_Q3_K_RDNA2 64
+#define NWARPS_Q3_K_RDNA2 8
+#define MMQ_X_Q3_K_RDNA1 32
+#define MMQ_Y_Q3_K_RDNA1 128
+#define NWARPS_Q3_K_RDNA1 8
 #define MMQ_X_Q3_K_AMPERE 128
 #define MMQ_Y_Q3_K_AMPERE 128
 #define NWARPS_Q3_K_AMPERE 4
@@ -3660,14 +3864,33 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
 #define NWARPS_Q3_K_PASCAL 8
 
 template <bool need_check> static __global__ void
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
     __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
     mul_mat_q3_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
 
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q3_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q3_K_RDNA2;
+    const int nwarps = NWARPS_Q3_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q3_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q3_K_RDNA1;
+    const int nwarps = NWARPS_Q3_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q3_K_AMPERE;
     const int mmq_y = MMQ_Y_Q3_K_AMPERE;
     const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3690,6 +3913,12 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_TURING
 }
 
+#define MMQ_X_Q4_K_RDNA2 64
+#define MMQ_Y_Q4_K_RDNA2 128
+#define NWARPS_Q4_K_RDNA2 8
+#define MMQ_X_Q4_K_RDNA1 32
+#define MMQ_Y_Q4_K_RDNA1 64
+#define NWARPS_Q4_K_RDNA1 8
 #define MMQ_X_Q4_K_AMPERE 64
 #define MMQ_Y_Q4_K_AMPERE 128
 #define NWARPS_Q4_K_AMPERE 4
@@ -3698,14 +3927,33 @@ template <bool need_check> static __global__ void
 #define NWARPS_Q4_K_PASCAL 8
 
 template <bool need_check> static __global__ void
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
     mul_mat_q4_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
 
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q4_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q4_K_RDNA2;
+    const int nwarps = NWARPS_Q4_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q4_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q4_K_RDNA1;
+    const int nwarps = NWARPS_Q4_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q4_K_AMPERE;
     const int mmq_y = MMQ_Y_Q4_K_AMPERE;
     const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -3728,6 +3976,12 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_TURING
 }
 
+#define MMQ_X_Q5_K_RDNA2 64
+#define MMQ_Y_Q5_K_RDNA2 128
+#define NWARPS_Q5_K_RDNA2 8
+#define MMQ_X_Q5_K_RDNA1 32
+#define MMQ_Y_Q5_K_RDNA1 64
+#define NWARPS_Q5_K_RDNA1 8
 #define MMQ_X_Q5_K_AMPERE 64
 #define MMQ_Y_Q5_K_AMPERE 128
 #define NWARPS_Q5_K_AMPERE 4
@@ -3735,11 +3989,32 @@ template <bool need_check> static __global__ void
 #define MMQ_Y_Q5_K_PASCAL 64
 #define NWARPS_Q5_K_PASCAL 8
 
-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
 
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q5_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q5_K_RDNA2;
+    const int nwarps = NWARPS_Q5_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q5_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q5_K_RDNA1;
+    const int nwarps = NWARPS_Q5_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q5_K_AMPERE;
     const int mmq_y = MMQ_Y_Q5_K_AMPERE;
     const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -3762,6 +4037,12 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }
 
+#define MMQ_X_Q6_K_RDNA2 64
+#define MMQ_Y_Q6_K_RDNA2 128
+#define NWARPS_Q6_K_RDNA2 8
+#define MMQ_X_Q6_K_RDNA1 32
+#define MMQ_Y_Q6_K_RDNA1 64
+#define NWARPS_Q6_K_RDNA1 8
 #define MMQ_X_Q6_K_AMPERE 64
 #define MMQ_Y_Q6_K_AMPERE 64
 #define NWARPS_Q6_K_AMPERE 4
@@ -3770,14 +4051,33 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
 #define NWARPS_Q6_K_PASCAL 8
 
 template <bool need_check> static __global__ void
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
     __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
     mul_mat_q6_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
 
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q6_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q6_K_RDNA2;
+    const int nwarps = NWARPS_Q6_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q6_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q6_K_RDNA1;
+    const int nwarps = NWARPS_Q6_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q6_K_AMPERE;
     const int mmq_y = MMQ_Y_Q6_K_AMPERE;
     const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4086,7 +4386,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
     dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }
 
-static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
+    const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;
 
@@ -4098,8 +4399,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     const int i = row*ncols + col;
 
     const float col_theta_scale = powf(theta_scale, col);
+    const float p = p0 + p_delta*(row/p_delta_rows);
 
-    const float theta = p*col_theta_scale;
+    const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);
 
@@ -4109,7 +4411,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     dst[i + 0] = x0*cos_theta - x1*sin_theta;
     dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
 
-    const float block_theta =
+    const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
     const float sin_block_theta = sinf(block_theta);
     const float cos_block_theta = cosf(block_theta);
 
@@ -4558,7 +4860,15 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q4_0_RDNA2;
+        mmq_y = MMQ_Y_Q4_0_RDNA2;
+        nwarps = NWARPS_Q4_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q4_0_RDNA1;
+        mmq_y = MMQ_Y_Q4_0_RDNA1;
+        nwarps = NWARPS_Q4_0_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q4_0_AMPERE;
         mmq_y = MMQ_Y_Q4_0_AMPERE;
         nwarps = NWARPS_Q4_0_AMPERE;
@@ -4595,7 +4905,15 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q4_1_RDNA2;
+        mmq_y = MMQ_Y_Q4_1_RDNA2;
+        nwarps = NWARPS_Q4_1_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q4_1_RDNA1;
+        mmq_y = MMQ_Y_Q4_1_RDNA1;
+        nwarps = NWARPS_Q4_1_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q4_1_AMPERE;
         mmq_y = MMQ_Y_Q4_1_AMPERE;
         nwarps = NWARPS_Q4_1_AMPERE;
@@ -4632,7 +4950,15 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q5_0_RDNA2;
+        mmq_y = MMQ_Y_Q5_0_RDNA2;
+        nwarps = NWARPS_Q5_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q5_0_RDNA1;
+        mmq_y = MMQ_Y_Q5_0_RDNA1;
+        nwarps = NWARPS_Q5_0_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q5_0_AMPERE;
         mmq_y = MMQ_Y_Q5_0_AMPERE;
         nwarps = NWARPS_Q5_0_AMPERE;
@@ -4669,7 +4995,15 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q5_1_RDNA2;
+        mmq_y = MMQ_Y_Q5_1_RDNA2;
+        nwarps = NWARPS_Q5_1_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q5_1_RDNA1;
+        mmq_y = MMQ_Y_Q5_1_RDNA1;
+        nwarps = NWARPS_Q5_1_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q5_1_AMPERE;
         mmq_y = MMQ_Y_Q5_1_AMPERE;
         nwarps = NWARPS_Q5_1_AMPERE;
@@ -4706,7 +5040,15 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q8_0_RDNA2;
+        mmq_y = MMQ_Y_Q8_0_RDNA2;
+        nwarps = NWARPS_Q8_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q8_0_RDNA1;
+        mmq_y = MMQ_Y_Q8_0_RDNA1;
+        nwarps = NWARPS_Q8_0_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q8_0_AMPERE;
         mmq_y = MMQ_Y_Q8_0_AMPERE;
         nwarps = NWARPS_Q8_0_AMPERE;
@@ -4743,7 +5085,15 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q2_K_RDNA2;
+        mmq_y = MMQ_Y_Q2_K_RDNA2;
+        nwarps = NWARPS_Q2_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q2_K_RDNA1;
+        mmq_y = MMQ_Y_Q2_K_RDNA1;
+        nwarps = NWARPS_Q2_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q2_K_AMPERE;
         mmq_y = MMQ_Y_Q2_K_AMPERE;
         nwarps = NWARPS_Q2_K_AMPERE;
@@ -4782,7 +5132,15 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q3_K_RDNA2;
+        mmq_y = MMQ_Y_Q3_K_RDNA2;
+        nwarps = NWARPS_Q3_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q3_K_RDNA1;
+        mmq_y = MMQ_Y_Q3_K_RDNA1;
+        nwarps = NWARPS_Q3_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q3_K_AMPERE;
         mmq_y = MMQ_Y_Q3_K_AMPERE;
         nwarps = NWARPS_Q3_K_AMPERE;
@@ -4820,7 +5178,15 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q4_K_RDNA2;
+        mmq_y = MMQ_Y_Q4_K_RDNA2;
+        nwarps = NWARPS_Q4_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q4_K_RDNA1;
+        mmq_y = MMQ_Y_Q4_K_RDNA1;
+        nwarps = NWARPS_Q4_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q4_K_AMPERE;
         mmq_y = MMQ_Y_Q4_K_AMPERE;
         nwarps = NWARPS_Q4_K_AMPERE;
@@ -4857,7 +5223,15 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q5_K_RDNA2;
+        mmq_y = MMQ_Y_Q5_K_RDNA2;
+        nwarps = NWARPS_Q5_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q5_K_RDNA1;
+        mmq_y = MMQ_Y_Q5_K_RDNA1;
+        nwarps = NWARPS_Q5_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q5_K_AMPERE;
         mmq_y = MMQ_Y_Q5_K_AMPERE;
         nwarps = NWARPS_Q5_K_AMPERE;
@@ -4894,7 +5268,15 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q6_K_RDNA2;
+        mmq_y = MMQ_Y_Q6_K_RDNA2;
+        nwarps = NWARPS_Q6_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q6_K_RDNA1;
+        mmq_y = MMQ_Y_Q6_K_RDNA1;
+        nwarps = NWARPS_Q6_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q6_K_AMPERE;
         mmq_y = MMQ_Y_Q6_K_AMPERE;
         nwarps = NWARPS_Q6_K_AMPERE;
@@ -4984,12 +5366,13 @@ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, co
     rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }
 
-static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float
-
-
-    const
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+    const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
+    GGML_ASSERT(ncols % 4 == 0);
+    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
+    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols,
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
 }
 
 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5127,25 +5510,30 @@ void ggml_init_cublas() {
     GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
     int64_t total_vram = 0;
     fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-    for (
+    for (int64_t id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, " Device %
+        fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
 
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
-
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
+#else
         g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     }
-    for (
+    for (int64_t id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
     }
 
-    for (
-        CUDA_CHECK(
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
 
-        // create
-
+        // create cuda streams
+        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
+        }
 
         // create cublas handle
         CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
@@ -5214,7 +5602,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     if (src->backend == GGML_BACKEND_CPU) {
         kind = cudaMemcpyHostToDevice;
         src_ptr = (char *) src->data;
-    } else if (src->backend == GGML_BACKEND_GPU) {
+    } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
+        GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = cudaMemcpyDeviceToDevice;
         struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
@@ -5253,236 +5642,205 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
|
|
5253
5642
|
}
|
5254
5643
|
|
5255
5644
|
inline void ggml_cuda_op_add(
|
5256
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5257
|
-
float *
|
5258
|
-
cudaStream_t & cudaStream_main){
|
5645
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5646
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5259
5647
|
|
5260
|
-
GGML_ASSERT(
|
5261
|
-
GGML_ASSERT(src1_ddf_i != nullptr);
|
5262
|
-
GGML_ASSERT(dst_ddf_i != nullptr);
|
5263
|
-
|
5264
|
-
const int64_t ne00 = src0->ne[0];
|
5265
|
-
const int64_t i01_diff = i01_high - i01_low;
|
5648
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
5266
5649
|
|
5267
5650
|
const int64_t ne10 = src1->ne[0];
|
5268
5651
|
const int64_t ne11 = src1->ne[1];
|
5269
5652
|
|
5270
|
-
// compute
|
5271
5653
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
5272
|
-
add_f32_cuda(
|
5654
|
+
add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
|
5273
5655
|
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
5274
|
-
add_f16_f32_f16_cuda((half *)
|
5656
|
+
add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
|
5275
5657
|
} else {
|
5276
5658
|
GGML_ASSERT(false);
|
5277
5659
|
}
|
5278
5660
|
|
5279
5661
|
(void) src1;
|
5280
5662
|
(void) dst;
|
5281
|
-
(void) src0_ddq_i;
|
5282
|
-
(void) i02;
|
5283
|
-
(void) i1;
|
5284
5663
|
}
|
5285
5664
|
|
5286
5665
|
inline void ggml_cuda_op_mul(
|
5287
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5288
|
-
float *
|
5289
|
-
cudaStream_t & cudaStream_main){
|
5290
|
-
|
5291
|
-
GGML_ASSERT(src0_ddf_i != nullptr);
|
5292
|
-
GGML_ASSERT(src1_ddf_i != nullptr);
|
5293
|
-
GGML_ASSERT(dst_ddf_i != nullptr);
|
5666
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5667
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5294
5668
|
|
5295
|
-
|
5296
|
-
|
5669
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
5670
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
5671
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5297
5672
|
|
5298
5673
|
const int64_t ne10 = src1->ne[0];
|
5299
5674
|
const int64_t ne11 = src1->ne[1];
|
5300
5675
|
|
5301
|
-
mul_f32_cuda(
|
5676
|
+
mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
|
5302
5677
|
|
5303
5678
|
(void) dst;
|
5304
|
-
(void) src0_ddq_i;
|
5305
|
-
(void) i02;
|
5306
|
-
(void) i1;
|
5307
5679
|
}
|
5308
5680
|
|
5309
5681
|
inline void ggml_cuda_op_gelu(
|
5310
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5311
|
-
float *
|
5312
|
-
cudaStream_t & cudaStream_main){
|
5682
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5683
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5313
5684
|
|
5314
|
-
GGML_ASSERT(
|
5315
|
-
GGML_ASSERT(
|
5685
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
5686
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5316
5687
|
|
5317
|
-
|
5318
|
-
const int64_t i01_diff = i01_high - i01_low;
|
5319
|
-
|
5320
|
-
// compute
|
5321
|
-
gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
|
5688
|
+
gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
|
5322
5689
|
|
5323
5690
|
(void) src1;
|
5324
5691
|
(void) dst;
|
5325
|
-
(void)
|
5326
|
-
(void) src1_ddf_i;
|
5327
|
-
(void) i02;
|
5328
|
-
(void) i1;
|
5692
|
+
(void) src1_dd;
|
5329
5693
|
}
|
5330
5694
|
|
5331
5695
|
inline void ggml_cuda_op_silu(
|
5332
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5333
|
-
float *
|
5334
|
-
cudaStream_t & cudaStream_main){
|
5696
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5697
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5335
5698
|
|
5336
|
-
GGML_ASSERT(
|
5337
|
-
GGML_ASSERT(
|
5699
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
5700
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5338
5701
|
|
5339
|
-
|
5340
|
-
const int64_t i01_diff = i01_high - i01_low;
|
5341
|
-
|
5342
|
-
// compute
|
5343
|
-
silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
|
5702
|
+
silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
|
5344
5703
|
|
5345
5704
|
(void) src1;
|
5346
5705
|
(void) dst;
|
5347
|
-
(void)
|
5348
|
-
(void) src1_ddf_i;
|
5349
|
-
(void) i02;
|
5350
|
-
(void) i1;
|
5706
|
+
(void) src1_dd;
|
5351
5707
|
}
|
5352
5708
|
|
5353
5709
|
inline void ggml_cuda_op_norm(
|
5354
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5355
|
-
float *
|
5356
|
-
cudaStream_t & cudaStream_main){
|
5710
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5711
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5357
5712
|
|
5358
|
-
GGML_ASSERT(
|
5359
|
-
GGML_ASSERT(
|
5713
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
5714
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5360
5715
|
|
5361
5716
|
const int64_t ne00 = src0->ne[0];
|
5362
|
-
const int64_t
|
5717
|
+
const int64_t nrows = ggml_nrows(src0);
|
5363
5718
|
|
5364
|
-
|
5365
|
-
norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
|
5719
|
+
norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
|
5366
5720
|
|
5367
5721
|
(void) src1;
|
5368
5722
|
(void) dst;
|
5369
|
-
(void)
|
5370
|
-
(void) src1_ddf_i;
|
5371
|
-
(void) i02;
|
5372
|
-
(void) i1;
|
5723
|
+
(void) src1_dd;
|
5373
5724
|
}
|
5374
5725
|
|
5375
5726
|
inline void ggml_cuda_op_rms_norm(
|
5376
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5377
|
-
float *
|
5378
|
-
cudaStream_t & cudaStream_main){
|
5727
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5728
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5379
5729
|
|
5380
|
-
GGML_ASSERT(
|
5381
|
-
GGML_ASSERT(
|
5730
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
5731
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5382
5732
|
|
5383
5733
|
const int64_t ne00 = src0->ne[0];
|
5384
|
-
const int64_t
|
5734
|
+
const int64_t nrows = ggml_nrows(src0);
|
5385
5735
|
|
5386
5736
|
float eps;
|
5387
5737
|
memcpy(&eps, dst->op_params, sizeof(float));
|
5388
5738
|
|
5389
|
-
|
5390
|
-
rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
|
5739
|
+
rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
|
5391
5740
|
|
5392
5741
|
(void) src1;
|
5393
5742
|
(void) dst;
|
5394
|
-
(void)
|
5395
|
-
(void) src1_ddf_i;
|
5396
|
-
(void) i02;
|
5397
|
-
(void) i1;
|
5743
|
+
(void) src1_dd;
|
5398
5744
|
}
|
5399
5745
|
|
5400
5746
|
inline void ggml_cuda_op_mul_mat_q(
|
5401
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
|
5402
|
-
|
5403
|
-
cudaStream_t &
|
5404
|
-
|
5405
|
-
GGML_ASSERT(src0_ddq_i != nullptr);
|
5406
|
-
GGML_ASSERT(src1_ddf_i != nullptr);
|
5407
|
-
GGML_ASSERT(dst_ddf_i != nullptr);
|
5747
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
5748
|
+
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
5749
|
+
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
5408
5750
|
|
5409
5751
|
const int64_t ne00 = src0->ne[0];
|
5410
5752
|
|
5411
5753
|
const int64_t ne10 = src1->ne[0];
|
5412
|
-
const int64_t ne11 = src1->ne[1];
|
5413
5754
|
GGML_ASSERT(ne10 % QK8_1 == 0);
|
5414
5755
|
|
5415
5756
|
const int64_t ne0 = dst->ne[0];
|
5416
5757
|
|
5417
|
-
const int64_t
|
5758
|
+
const int64_t row_diff = row_high - row_low;
|
5418
5759
|
|
5419
5760
|
int id;
|
5420
5761
|
CUDA_CHECK(cudaGetDevice(&id));
|
5421
5762
|
|
5422
5763
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
5423
5764
|
// nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
|
5424
|
-
const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 :
|
5425
|
-
|
5426
|
-
const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
|
5427
|
-
ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
|
5428
|
-
size_t as;
|
5429
|
-
void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
|
5430
|
-
-        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
+    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            ggml_mul_mat_q4_0_q8_1_cuda(
+            ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q4_1:
-            ggml_mul_mat_q4_1_q8_1_cuda(
+            ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q5_0:
-            ggml_mul_mat_q5_0_q8_1_cuda(
+            ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q5_1:
-            ggml_mul_mat_q5_1_q8_1_cuda(
+            ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q8_0:
-            ggml_mul_mat_q8_0_q8_1_cuda(
+            ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q2_K:
-            ggml_mul_mat_q2_K_q8_1_cuda(
+            ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q3_K:
-            ggml_mul_mat_q3_K_q8_1_cuda(
+            ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q4_K:
-            ggml_mul_mat_q4_K_q8_1_cuda(
+            ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q5_K:
-            ggml_mul_mat_q5_K_q8_1_cuda(
+            ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q6_K:
-            ggml_mul_mat_q6_K_q8_1_cuda(
+            ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         default:
             GGML_ASSERT(false);
             break;
     }
 
-    ggml_cuda_pool_free(src1_q8_1, as);
-
     (void) src1;
     (void) dst;
-    (void)
-    (void) i02;
-    (void) i1;
+    (void) src1_ddf_i;
 }
 
 static int64_t get_row_rounding(ggml_type type) {
+    int64_t min_compute_capability = INT_MAX;
+    int64_t max_compute_capability = INT_MIN;
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            if (min_compute_capability > g_compute_capabilities[id]) {
+                min_compute_capability = g_compute_capabilities[id];
+            }
+            if (max_compute_capability < g_compute_capabilities[id]) {
+                max_compute_capability = g_compute_capabilities[id];
+            }
         }
     }
 
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    switch(type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
+        case GGML_TYPE_F16:
+            return 1;
+        case GGML_TYPE_Q2_K:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
+        case GGML_TYPE_Q3_K:
+            return min_compute_capability < CC_RDNA2 ? 128 : 64;
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
+        default:
+            GGML_ASSERT(false);
+    }
+#else
     switch(type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
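Note on the new get_row_rounding() logic above: on the HIP/AMD build the rounding is now chosen from the maximum (or, for Q3_K, minimum) compute capability of the devices that actually receive rows, with RDNA2-class GPUs getting larger tiles. Below is a minimal standalone sketch of how such a rounding value is applied to a split boundary; it is plain C++, not the library code, and the constants and main() values are illustrative stand-ins.

    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-ins for the constants used in the diff above.
    static const int64_t CC_OFFSET_AMD = 1000000;
    static const int64_t CC_RDNA2      = CC_OFFSET_AMD + 1030;

    // Same shape as the Q4_0/.../Q8_0 branch: newer AMD GPUs get a larger tile.
    static int64_t row_rounding_q4(int64_t max_compute_capability) {
        return max_compute_capability >= CC_RDNA2 ? 128 : 64;
    }

    // A split boundary is snapped down to a multiple of the rounding value,
    // which is what the row_low/row_high computation later in this diff does.
    static int64_t snap_down(int64_t row, int64_t rounding) {
        return row - row % rounding;
    }

    int main() {
        const int64_t nrows    = 4096;   // illustrative tensor height
        const double  split    = 0.37;   // illustrative tensor-split fraction
        const int64_t rounding = row_rounding_q4(CC_RDNA2);
        const int64_t boundary = (int64_t)(nrows*split);
        printf("boundary %lld -> %lld (rounding %lld)\n",
               (long long) boundary, (long long) snap_down(boundary, rounding), (long long) rounding);
        return 0;
    }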
@@ -5503,170 +5861,147 @@ static int64_t get_row_rounding(ggml_type type) {
         default:
             GGML_ASSERT(false);
     }
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 }
 
-inline void
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
-    cudaStream_t &
-    GGML_ASSERT(src0_ddq_i != nullptr);
-    GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i != nullptr);
+inline void ggml_cuda_op_mul_mat_vec_q(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
     const int64_t ne00 = src0->ne[0];
-    const int64_t
+    const int64_t row_diff = row_high - row_low;
 
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q2_K:
+            mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q3_K:
+            mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_K:
+            mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q6_K:
+            mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
 
-#if QK_K == 256
-    mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
-        src0->type == GGML_TYPE_Q2_K ||
-        src0->type == GGML_TYPE_Q3_K ||
-        src0->type == GGML_TYPE_Q4_K ||
-        src0->type == GGML_TYPE_Q5_K ||
-        src0->type == GGML_TYPE_Q6_K;
-#endif // QK_K == 256
-
-    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
-#endif
+    (void) src1;
+    (void) dst;
+    (void) src1_ddf_i;
+    (void) src1_ncols;
+    (void) src1_padded_row_size;
+}
 
-    void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
-    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q4_1:
-            mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q5_0:
-            mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q5_1:
-            mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q8_0:
-            mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q2_K:
-            mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q3_K:
-            mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q4_K:
-            mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q5_K:
-            mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        case GGML_TYPE_Q6_K:
-            mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
+inline void ggml_cuda_op_dequantize_mul_mat_vec(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
+    const int64_t ne00 = src0->ne[0];
+    const int64_t row_diff = row_high - row_low;
+
+    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 #ifdef GGML_CUDA_F16
+    size_t ash;
+    dfloat * src1_dfloat = nullptr; // dfloat == half
+
+    bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
+        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+
+    if (src1_convert_f16) {
+        src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
+        ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
+                                ne00, 1, sizeof(float), 0, 0,
+                                ne00, 1, sizeof(half), 0, 0, stream);
+    }
 #else
-
+    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
 #endif // GGML_CUDA_F16
 
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q2_K:
+            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q3_K:
+            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_K:
+            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q6_K:
+            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_F16:
+            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
 
 #ifdef GGML_CUDA_F16
-    }
-#endif // GGML_CUDA_F16
+    if (src1_convert_f16) {
+        ggml_cuda_pool_free(src1_dfloat, ash);
     }
+#endif // GGML_CUDA_F16
 
     (void) src1;
     (void) dst;
-    (void)
-    (void)
-    (void)
+    (void) src1_ddq_i;
+    (void) src1_ncols;
+    (void) src1_padded_row_size;
 }
 
 inline void ggml_cuda_op_mul_mat_cublas(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
-    cudaStream_t &
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
-    GGML_ASSERT(
+    GGML_ASSERT(src0_dd_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(
+    GGML_ASSERT(dst_dd_i != nullptr);
 
     const float alpha = 1.0f;
     const float beta = 0.0f;
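The two new ops above, ggml_cuda_op_mul_mat_vec_q and ggml_cuda_op_dequantize_mul_mat_vec, both reduce to a per-type dispatch onto a matching kernel launcher. Below is a small standalone sketch of that dispatch shape; it is plain C++, and the enum, kernel names and signature are hypothetical stand-ins rather than the ggml API.

    #include <cstdio>

    // Hypothetical mini-enum standing in for ggml_type; only a few entries.
    enum class qtype { q4_0, q4_1, q8_0, f16 };

    // One launcher signature shared by every quantization type, as in the switch above.
    using vec_kernel_t = void (*)(const void * vx, const void * vy, float * dst, int ncols, int nrows);

    static void kernel_q4_0(const void *, const void *, float *, int, int) { puts("q4_0 kernel"); }
    static void kernel_q4_1(const void *, const void *, float *, int, int) { puts("q4_1 kernel"); }
    static void kernel_q8_0(const void *, const void *, float *, int, int) { puts("q8_0 kernel"); }

    // Same shape as ggml_cuda_op_mul_mat_vec_q: one launcher per type, anything else is an error.
    static vec_kernel_t pick_kernel(qtype t) {
        switch (t) {
            case qtype::q4_0: return kernel_q4_0;
            case qtype::q4_1: return kernel_q4_1;
            case qtype::q8_0: return kernel_q8_0;
            default:          return nullptr;   // the real code asserts here
        }
    }

    int main() {
        vec_kernel_t k = pick_kernel(qtype::q8_0);
        if (k) {
            k(nullptr, nullptr, nullptr, 0, 0);
        }
        return 0;
    }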
@@ -5674,43 +6009,54 @@ inline void ggml_cuda_op_mul_mat_cublas(
     const int64_t ne00 = src0->ne[0];
 
     const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
 
     const int64_t ne0 = dst->ne[0];
-    const int64_t
+    const int64_t row_diff = row_high - row_low;
+
+    float * src0_ddq_as_f32;
+    size_t src0_as = 0;
+
+    if (src0->type != GGML_TYPE_F32) {
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+        src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
+        to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
+    }
+    const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
 
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
 
     // the main device has a larger memory buffer to hold the results from all GPUs
     // ldc == nrows of the matrix that cuBLAS writes into
-    int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 :
+    int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
 
-    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id],
+    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
     CUBLAS_CHECK(
         cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                row_diff, src1_ncols, ne10,
                 &alpha, src0_ddf_i, ne00,
-                        src1_ddf_i,
-                &beta,
+                        src1_ddf_i, ne10,
+                &beta,  dst_dd_i,  ldc));
+
+    if (src0_as > 0) {
+        ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
+    }
 
     (void) dst;
-    (void)
-    (void)
-    (void) i1;
+    (void) src1_ddq_i;
+    (void) src1_padded_row_size;
 }
 
 inline void ggml_cuda_op_rope(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    float *
-    cudaStream_t & cudaStream_main){
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
-    const int64_t
+    const int64_t nrows = ggml_nrows(src0);
 
     const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
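The reworked cuBLAS path above now dequantizes src0 to F32 on the fly when it is not already F32, and picks the leading dimension depending on whether the result lands in the full dst buffer or in a per-device slice. Below is a standalone sketch of those two decisions in plain C++; as_f32 and leading_dim are hypothetical helpers, not the library API.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // If src0 is not already F32 it is dequantized into a temporary, otherwise it is
    // used in place; ldc depends on whether cuBLAS writes into the full dst buffer
    // or into a per-device slice.
    static const float * as_f32(const float * src0_f32, const void * src0_q,
                                std::vector<float> & scratch, int64_t n, bool is_f32) {
        if (is_f32) {
            return src0_f32;        // no conversion needed
        }
        (void) src0_q;              // a dequantization kernel would fill scratch from src0_q
        scratch.assign((size_t) n, 0.0f);
        return scratch.data();
    }

    static int leading_dim(bool dst_is_full_buffer, int ne0, int row_diff) {
        return dst_is_full_buffer ? ne0 : row_diff;   // mirrors the ldc computation above
    }

    int main() {
        std::vector<float> scratch;
        const float dummy[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        const float * a = as_f32(dummy, nullptr, scratch, 4, true);
        printf("in-place: %d, ldc: %d\n", (int) (a == dummy), leading_dim(true, 4096, 512));
        return 0;
    }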
@@ -5723,44 +6069,37 @@ inline void ggml_cuda_op_rope(
     memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
 
     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
 
     // compute
     if (is_glm) {
-        const float id_p = min(p, n_ctx - 2.f);
-        const float block_p = max(p - (n_ctx - 2.f), 0.f);
-        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream);
     } else if (is_neox) {
         GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
-        rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
+        rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
     } else {
-        rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
+        rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
     }
 
     (void) src1;
     (void) dst;
-    (void)
-    (void) src1_ddf_i;
-    (void) i1;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_alibi(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    float *
-    cudaStream_t & cudaStream_main){
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
-    const int64_t
+    const int64_t nrows = ggml_nrows(src0);
 
     const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
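The new RoPE path computes its scalars on the host exactly as written above: theta_scale from freq_base and n_dims, and p0 from n_past, mode and freq_scale. Below is a tiny self-contained check of those two expressions; the input values are illustrative only.

    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_past     = 32;
        const int   n_dims     = 128;
        const int   mode       = 0;
        const float freq_base  = 10000.0f;
        const float freq_scale = 1.0f;

        const float theta_scale = powf(freq_base, -2.0f/n_dims);
        const float p0 = ((mode & 1) == 0 ? n_past : 0) * freq_scale;

        printf("theta_scale = %f, p0 = %f\n", theta_scale, p0);
        return 0;
    }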
@@ -5775,334 +6114,354 @@ inline void ggml_cuda_op_alibi(
     const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
 
-    alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
+    alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
 
     (void) src1;
-    (void)
-    (void) src1_ddf_i;
-    (void) i1;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_diag_mask_inf(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    float *
-    cudaStream_t & cudaStream_main){
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
-    const
+    const int nrows0 = ggml_nrows(src0);
 
     const int n_past = ((int32_t *) dst->op_params)[0];
 
-    diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
+    diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
 
     (void) src1;
     (void) dst;
-    (void)
-    (void) src1_ddf_i;
-    (void) i02;
-    (void) i1;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_soft_max(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    float *
-    cudaStream_t & cudaStream_main){
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
     const int64_t ne00 = src0->ne[0];
-    const int64_t
+    const int64_t nrows = ggml_nrows(src0);
 
-    soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+    soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
 
     (void) src1;
     (void) dst;
-    (void)
-    (void) src1_ddf_i;
-    (void) i02;
-    (void) i1;
+    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_scale(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    float *
-    cudaStream_t & cudaStream_main){
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
     const float scale = ((float *) src1->data)[0];
 
-    const int64_t i01_diff = i01_high - i01_low;
-
-    // compute
-    scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
+    scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
 
     (void) src1;
     (void) dst;
-    (void)
+    (void) src1_dd;
+}
+
+static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
+    const int64_t nrows0 = ggml_nrows(src0);
+
+    const bool use_src1 = src1 != nullptr;
+    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(              dst->backend != GGML_BACKEND_GPU_SPLIT);
+
+    struct ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
+    struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    struct ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *) dst->extra;
+
+    const bool src0_on_device =             src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
+    const bool  dst_on_device =              dst->backend == GGML_BACKEND_GPU;
+
+    const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
+
+    // dd = data device
+    float * src0_ddf = nullptr;
+    float * src1_ddf = nullptr;
+    float *  dst_ddf = nullptr;
+
+    // as = actual size
+    size_t src0_asf = 0;
+    size_t src1_asf = 0;
+    size_t  dst_asf = 0;
+
+    ggml_cuda_set_device(g_main_device);
+    const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    if (src0_on_device) {
+        src0_ddf = (float *) src0_extra->data_device[g_main_device];
+    } else {
+        src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
+        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
+    }
+
+    if (use_src1 && !src1_stays_on_host) {
+        if (src1_on_device) {
+            src1_ddf = (float *) src1_extra->data_device[g_main_device];
+        } else {
+            src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
+            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
+        }
+    }
+    if (dst_on_device) {
+        dst_ddf = (float *) dst_extra->data_device[g_main_device];
+    } else {
+        dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
+    }
+
+    // do the computation
+    op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
+    CUDA_CHECK(cudaGetLastError());
+
+    // copy dst to host if necessary
+    if (!dst_on_device) {
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
+    }
+
+    if (src0_asf > 0) {
+        ggml_cuda_pool_free(src0_ddf, src0_asf);
+    }
+    if (src1_asf > 0) {
+        ggml_cuda_pool_free(src1_ddf, src1_asf);
+    }
+    if (dst_asf > 0) {
+        ggml_cuda_pool_free(dst_ddf, dst_asf);
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
 }
 
-static void
+static void ggml_cuda_op_mul_mat(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
+    const bool convert_src1_to_q8_1) {
+
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
     const int64_t nrows0 = ggml_nrows(src0);
 
-    const
-    const int64_t
-    const int64_t
-    const int64_t
-    const int64_t
-    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int64_t nrows1 = ggml_nrows(src1);
 
     GGML_ASSERT(ne03 == ne13);
 
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
 
-    const int nb2
-    const int nb3
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
 
     GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(
+    GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
 
-    const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
-    const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
-    const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
-    const int64_t src0_stride = ne00 * ne01 * stride_mod;
-    const int64_t src1_stride = ne10 * ne11 * stride_mod;
-    const int64_t dst_stride = ne0 * ne1 * stride_mod;
+    GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
 
-    const int64_t
-    const int64_t i03_max = flatten_rows ? 1 : ne03;
-    const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
-    const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
-    GGML_ASSERT(!(flatten_rows && ne02 < ne12));
+    const int64_t i02_divisor = ne12 / ne02;
 
     const size_t src0_ts = ggml_type_size(src0->type);
     const size_t src0_bs = ggml_blck_size(src0->type);
+    const size_t q8_1_ts = sizeof(block_q8_1);
+    const size_t q8_1_bs = QK8_1;
 
-    struct ggml_tensor_extra_gpu * src0_extra =
-    struct ggml_tensor_extra_gpu * src1_extra =
-    struct ggml_tensor_extra_gpu *
+    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    struct ggml_tensor_extra_gpu * dst_extra  = (ggml_tensor_extra_gpu *) dst->extra;
 
     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
-    const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
 
-    const bool src1_is_contiguous =
-    const
+    const bool src1_is_contiguous = ggml_is_contiguous(src1);
+    const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ?
+        ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
 
     const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+    GGML_ASSERT(!(split && ne02 > 1));
+    GGML_ASSERT(!(split && ne03 > 1));
     GGML_ASSERT(!(split && ne02 < ne12));
 
-    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
-
     // dd = data device
-    char *
-    float *
-    float *
-    //
-    size_t
-    size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
+    char  *  src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
+    float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
+    char  * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
+    float *   dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+    // as = actual size
+    size_t  src0_as[GGML_CUDA_MAX_DEVICES] = {0};
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
-    size_t
+    size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t   dst_as[GGML_CUDA_MAX_DEVICES] = {0};
 
-    if (split && g_device_count > 1) {
-        CUDA_CHECK(cudaSetDevice(g_main_device));
-        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
-    }
+    int64_t  row_low[GGML_CUDA_MAX_DEVICES];
+    int64_t row_high[GGML_CUDA_MAX_DEVICES];
 
-    for (
-        int64_t row_low, row_high;
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        // by default, use all rows
+        row_low[id]  = 0;
+        row_high[id] = ne01;
+
+        // for multi GPU, get the row boundaries from tensor split
+        // and round to mul_mat_q tile sizes
         if (split) {
             const int64_t rounding = get_row_rounding(src0->type);
 
+            if (id != 0) {
+                row_low[id]  = ne01*g_tensor_split[id];
+                row_low[id] -= row_low[id] % rounding;
+            }
 
-            if (id
-                row_high
-                row_high = nrows0*g_tensor_split[id + 1];
-                row_high -= row_high % rounding;
+            if (id != g_device_count - 1) {
+                row_high[id]  = ne01*g_tensor_split[id + 1];
+                row_high[id] -= row_high[id] % rounding;
             }
-        } else {
-            row_low = 0;
-            row_high = nrows0*i02_divisor;
         }
+    }
+
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
             continue;
         }
 
-        cudaSetDevice(id);
-        cudaStream_t cudaStream_main = g_cudaStreams_main[id];
+        const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
+        const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
 
-            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
-        }
+        ggml_cuda_set_device(id);
+        const cudaStream_t stream = g_cudaStreams[id][0];
 
         if (src0_on_device && src0_is_contiguous) {
-            src0_ddf[id] = (float *) src0_extra->data_device[id];
-        } else {
-            src0_ddq[id] = (char *) src0_extra->data_device[id];
-        }
+            src0_dd[id] = (char *) src0_extra->data_device[id];
         } else {
-            } else {
-                src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
-            }
+            const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+            src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
         }
 
-        if (
+        if (src1_on_device && src1_is_contiguous) {
+            src1_ddf[id] = (float *) src1_extra->data_device[id];
+        } else {
+            src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
         }
 
-        if (
-            src1_ddf[id]
+        if (convert_src1_to_q8_1) {
+            src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
+
+            if (split && src1_on_device && src1_is_contiguous) {
+                quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
+                CUDA_CHECK(cudaGetLastError());
             }
         }
+
         if (dst_on_device) {
+            dst_dd[id] = (float *) dst_extra->data_device[id];
         } else {
-            size_t size_dst_ddf = split ?
+            const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
+            dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
         }
+    }
 
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signals that the main device has finished calculating the input data
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
+    }
 
+    const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
+        const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
+        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
 
+        for (int64_t id = 0; id < g_device_count; ++id) {
+            if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+                continue;
+            }
 
-            if (i0 < i0_offset_low || i0 > i0_offset_high) {
-                continue;
-            }
-            if (i0 == i0_offset_low) {
-                i01_low = row_low % rows_per_iter;
-            }
-            if (i0 == i0_offset_high) {
-                i01_high = row_high % rows_per_iter;
-            }
-        }
+            const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
+            const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
+            const int64_t row_diff = row_high[id] - row_low[id];
 
-            // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
-            // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
-            GGML_ASSERT(i01_low == 0 || g_device_count > 1);
-            GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
+            ggml_cuda_set_device(id);
+            const cudaStream_t stream = g_cudaStreams[id][is];
 
+            // wait for main GPU data if necessary
+            if (split && (id != g_main_device || is != 0)) {
+                CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
+            }
+
+            for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
+                const int64_t i03 = i0 / ne12;
+                const int64_t i02 = i0 % ne12;
+
+                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
 
                 // for split tensors the data begins at i0 == i0_offset_low
-                char *
-                float *
-                float *
-                // for split tensors the data pointer needs to be rounded down
-                // to the bin edge for i03, i02 bins beyond the first
-                if (i0 - i0_offset_low > 0) {
-                    GGML_ASSERT(!flatten_rows);
-                    src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
-                    src0_ddf_i -= (row_low % ne01)*ne00;
-                    dst_ddf_i -= (row_low % ne0)*ne1;
-                }
+                char  *  src0_dd_i =  src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
+                float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
+                char  * src1_ddq_i = src1_ddq[id] +  src1_ddq_i_offset;
+                float *   dst_dd_i =   dst_dd[id] + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);
 
                 // the main device memory buffer can be on VRAM scratch, with space for all partial results
                 // in that case an offset on dst_ddf_i is needed
                 if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
+                    dst_dd_i += row_low[id]; // offset is 0 if no tensor split
                 }
 
                 // copy src0, src1 to device if necessary
-                if (
-                    if (
-                        GGML_ASSERT(!flatten_rows);
+                if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
+                    if (id != g_main_device) {
+                        if (convert_src1_to_q8_1) {
+                            char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
+                            CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs,
+                                                    cudaMemcpyDeviceToDevice, stream));
+                        } else {
                             float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
-                        src1_ddf_i_source +=
-                        CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source,
-                                                cudaMemcpyDeviceToDevice,
+                            src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
+                            CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float),
+                                                    cudaMemcpyDeviceToDevice, stream));
                         }
-                } else if (src1_on_device && !src1_is_contiguous) {
-                    GGML_ASSERT(!split);
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
-                } else {
-                    GGML_ASSERT(false);
                     }
+                } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
+                                src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
+                } else {
+                    GGML_ASSERT(false);
                 }
 
-                if (
-                } else {
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
-                }
+                if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
+                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
+                    CUDA_CHECK(cudaGetLastError());
                 }
 
-                    to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
-                    CUDA_CHECK(cudaGetLastError());
+                if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
                 }
 
                 // do the computation
-                op(src0, src1, dst,
+                op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
+                    row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
                 CUDA_CHECK(cudaGetLastError());
 
                 // copy dst to host or other device if necessary
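Inside the new ggml_cuda_op_mul_mat shown above, src1 is processed in column chunks of MUL_MAT_SRC1_COL_STRIDE and each chunk is assigned a stream in round-robin fashion. Below is a standalone sketch of that chunk/stream arithmetic in plain C++; the two constants are placeholders with illustrative values.

    #include <cstdint>
    #include <cstdio>

    static const int64_t SRC1_COL_STRIDE = 128;   // stands in for MUL_MAT_SRC1_COL_STRIDE
    static const int64_t N_STREAMS       = 8;     // stands in for MAX_STREAMS

    int main() {
        const int64_t ne11 = 300;                 // illustrative number of src1 columns
        for (int64_t col0 = 0; col0 < ne11; col0 += SRC1_COL_STRIDE) {
            const int64_t is    = (col0/SRC1_COL_STRIDE) % N_STREAMS;
            const int64_t ncols = col0 + SRC1_COL_STRIDE > ne11 ? ne11 - col0 : SRC1_COL_STRIDE;
            printf("chunk at col %3lld: %3lld cols on stream %lld\n",
                   (long long) col0, (long long) ncols, (long long) is);
        }
        return 0;
    }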
@@ -6124,95 +6483,86 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                 // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
                 // Instead they need to be copied to the correct slice in ne0 = dst row index.
                 // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
-                float * dhf_dst_i = (float *) ((char *) dst_off_device +
+                    float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+                    GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
+                    dhf_dst_i += src1_col_0*ne0 + row_low[id];
+                    CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float),
+                                                 row_diff*sizeof(float), src1_ncols, kind, stream));
                 } else {
                     float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+                    GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
+                    dhf_dst_i += src1_col_0*ne0;
+                    CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
                 }
             }
 
-            //
-            if (split &&
-                CUDA_CHECK(cudaEventRecord(src0_extra->events[id],
+                // add event for the main device to wait on until other device is done
+                if (split && (id != g_main_device || is != 0)) {
+                    CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
+                }
             }
         }
     }
 
-    if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
-        continue;
-    }
-
-    CUDA_CHECK(cudaSetDevice(id));
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
 
-        if (src0_asf[id] > 0) {
-            ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
+        // free buffers again when done
+        if (src0_as[id] > 0) {
+            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
         }
         if (src1_asf[id] > 0) {
             ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
         }
-        if (
-            ggml_cuda_pool_free(
+        if (src1_asq[id] > 0) {
+            ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
+        }
+        if (dst_as[id] > 0) {
+            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
         }
     }
 
     // main device waits for all other devices to be finished
     if (split && g_device_count > 1) {
+        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
+        is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
+
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+        for (int64_t id = 0; id < g_device_count; ++id) {
+            for (int64_t is = 0; is < is_max; ++is) {
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
            }
        }
    }
 
     if (dst->backend == GGML_BACKEND_CPU) {
-        CUDA_CHECK(
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         CUDA_CHECK(cudaDeviceSynchronize());
     }
 }
 
 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    // Due to flatten_rows == true this does in practice not make a difference however.
-    // Better solution would be nice but right now that would require disproportionate changes.
-    GGML_ASSERT(
-        (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
-        src1->type == GGML_TYPE_F32 &&
-        (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }
 
 void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
 }
 
 void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
 }
 
 void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }
 
 void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }
 
 void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
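The element-wise wrappers above all funnel into the new ggml_cuda_op_flatten, which owns buffer setup and teardown and only receives an op callback. Below is a minimal sketch of that callback pattern in plain C++; all names are stand-ins, not the ggml API.

    #include <cstdio>

    // Every element-wise op becomes a function with one shared signature and the
    // wrapper owns buffer setup/teardown.
    using flat_op_t = void (*)(const float * src0, const float * src1, float * dst, int n);

    static void op_add_sketch(const float * a, const float * b, float * d, int n) {
        for (int i = 0; i < n; ++i) {
            d[i] = a[i] + (b ? b[i] : 0.0f);
        }
    }

    static void run_flattened(const float * src0, const float * src1, float * dst, int n, flat_op_t op) {
        // The real wrapper decides between device pointers and pool allocations,
        // launches on the main stream and copies dst back to the host if needed;
        // here the "computation" is just the callback itself.
        op(src0, src1, dst, n);
    }

    int main() {
        float a[3] = {1, 2, 3}, b[3] = {4, 5, 6}, d[3];
        run_flattened(a, b, d, 3, op_add_sketch);
        printf("%g %g %g\n", d[0], d[1], d[2]);
        return 0;
    }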
@@ -6246,8 +6596,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne12 = src1->ne[2];
 
-    CUDA_CHECK(
-    cudaStream_t
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -6258,7 +6608,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
-    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12,
+    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 }
 
 void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -6277,8 +6627,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int64_t nb01 = src0->nb[1];
     const int64_t nb02 = src0->nb[2];
 
-    CUDA_CHECK(
-    cudaStream_t
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
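Both the *_vec_p021 and *_vec_nc paths above now take their stream from a per-device table (g_cudaStreams[g_main_device][0]) instead of a single global stream. Below is a small sketch of such a per-device stream table in plain C++, with dummy ints standing in for cudaStream_t and illustrative array sizes.

    #include <cstdio>

    static const int N_DEVICES = 2;   // stands in for GGML_CUDA_MAX_DEVICES
    static const int N_STREAMS = 8;   // stands in for MAX_STREAMS

    int main() {
        int streams[N_DEVICES][N_STREAMS];
        for (int id = 0; id < N_DEVICES; ++id) {
            for (int is = 0; is < N_STREAMS; ++is) {
                streams[id][is] = id*100 + is;       // cudaStreamCreate(...) in the real code
            }
        }
        const int main_device = 0;
        const int main_stream = streams[main_device][0]; // what single-GPU ops launch on
        printf("main stream handle: %d\n", main_stream);
        return 0;
    }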
@@ -6289,38 +6639,49 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
-    const
-    const
+    const int64_t row_stride_x     = nb01 / sizeof(half);
+    const int64_t channel_stride_x = nb02 / sizeof(half);
 
-    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x,
+    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
         src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
 
+    int64_t min_compute_capability = INT_MAX;
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (min_compute_capability > g_compute_capabilities[id]
+                && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            min_compute_capability = g_compute_capabilities[id];
+        }
+    }
+
     if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
     } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
     }else if (src0->type == GGML_TYPE_F32) {
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
-        } else {
-            int min_compute_capability = INT_MAX;
-            for (int id = 0; id < g_device_count; ++id) {
-                if (min_compute_capability > g_compute_capabilities[id]
-                        && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-                    min_compute_capability = g_compute_capabilities[id];
-                }
-            }
 
+#ifdef GGML_CUDA_FORCE_DMMV
+            const bool use_mul_mat_vec_q = false;
+#else
+            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+#endif // GGML_CUDA_FORCE_DMMV
+
+            if (use_mul_mat_vec_q) {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+            } else {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+            }
+        } else {
             if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
             } else {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
             }
         }
     } else {
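The selection logic in ggml_cuda_mul_mat above, for quantized src0, boils down to a few host-side booleans: a vector-shaped src1 picks between mul_mat_vec_q and dequantize_mul_mat_vec (with GGML_CUDA_FORCE_DMMV as an override), otherwise mul_mat_q or cuBLAS is chosen. Below is a simplified standalone sketch of that decision in plain C++; it assumes a quantized src0 and folds the GGML_CUDA_DMMV_X divisibility check into the src1_is_vector flag, and 610 mirrors MIN_CC_DP4A above.

    #include <cstdio>

    static const char * pick_path(bool src1_is_vector, bool force_dmmv, bool mul_mat_q_enabled,
                                  int min_compute_capability) {
        const bool has_dp4a = min_compute_capability >= 610;
        if (src1_is_vector) {
            const bool use_mmvq = !force_dmmv && has_dp4a;
            return use_mmvq ? "mul_mat_vec_q" : "dequantize_mul_mat_vec";
        }
        return (mul_mat_q_enabled && has_dp4a) ? "mul_mat_q" : "cublas";
    }

    int main() {
        printf("%s\n", pick_path(true,  false, true, 700));   // -> mul_mat_vec_q
        printf("%s\n", pick_path(false, false, true, 520));   // -> cublas
        return 0;
    }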
@@ -6329,8 +6690,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     }
 }
 
 void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }
 
 void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6359,8 +6719,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     const int64_t nb11 = src1->nb[1];
     const int64_t nb12 = src1->nb[2];
 
-    CUDA_CHECK(
-    cudaStream_t
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -6370,10 +6730,10 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 
     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12,
+                              ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12,
+                              ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         GGML_ASSERT(false);
     }
@@ -6387,28 +6747,20 @@ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 }
 
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
 }
 
 void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
 }
 
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
-
-    const int mode = ((int32_t *) dst->op_params)[2];
-    const bool is_glm = mode & 4;
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
 }
 
 void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }
 
 void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6418,7 +6770,7 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 }
 
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
+    const int64_t nrows = ggml_nrows(tensor);
 
     const int64_t ne0 = tensor->ne[0];
 
@@ -6428,14 +6780,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
 
-    for (
+    for (int64_t id = 0; id < g_device_count; ++id) {
         if (backend == GGML_BACKEND_GPU && id != g_main_device) {
             continue;
         }
 
+        ggml_cuda_set_device(id);
 
+        int64_t row_low, row_high;
         if (backend == GGML_BACKEND_GPU) {
             row_low = 0;
             row_high = nrows;
@@ -6485,7 +6837,9 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         extra->data_device[id] = buf;
 
         if (backend == GGML_BACKEND_GPU_SPLIT) {
+            for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+                CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
+            }
         }
     }
 
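The hunk above creates one CUDA event per device and per stream for split tensors (extra->events[id][is]); the free path in the next hunk destroys only the events that were actually created. Below is a standalone sketch of that create/guard/destroy bookkeeping in plain C++, with dummy ints standing in for cudaEvent_t and illustrative array sizes.

    #include <cstdio>

    static const int N_DEVICES = 2;  // stands in for GGML_CUDA_MAX_DEVICES
    static const int N_STREAMS = 8;  // stands in for MAX_STREAMS

    int main() {
        int events[N_DEVICES][N_STREAMS] = {};          // zero == "no event", like nullptr
        for (int id = 0; id < N_DEVICES; ++id) {
            for (int is = 0; is < N_STREAMS; ++is) {
                events[id][is] = 1;                     // cudaEventCreateWithFlags(...) in the real code
            }
        }
        int destroyed = 0;
        for (int id = 0; id < N_DEVICES; ++id) {
            for (int is = 0; is < N_STREAMS; ++is) {
                if (events[id][is] != 0) {              // cudaEventDestroy(...) in the real code
                    events[id][is] = 0;
                    ++destroyed;
                }
            }
        }
        printf("destroyed %d events\n", destroyed);
        return 0;
    }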
@@ -6499,15 +6853,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
-    for (
+    for (int64_t id = 0; id < g_device_count; ++id) {
         if (extra->data_device[id] != nullptr) {
-            CUDA_CHECK(
+            CUDA_CHECK(ggml_cuda_set_device(id));
             CUDA_CHECK(cudaFree(extra->data_device[id]));
         }
 
+        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+            if (extra->events[id][is] != nullptr) {
+                CUDA_CHECK(ggml_cuda_set_device(id));
+                CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
+            }
         }
     }
 
@@ -6559,7 +6915,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
-    CUDA_CHECK(
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];