llama_cpp 0.5.0 → 0.5.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -2
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +101 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +1094 -678
- data/ext/llama_cpp/src/ggml-metal.m +89 -23
- data/ext/llama_cpp/src/ggml-metal.metal +398 -211
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +32 -56
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +49 -13
- data/ext/llama_cpp/src/llama.cpp +833 -281
- data/ext/llama_cpp/src/llama.h +11 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -13,7 +13,7 @@
 #ifdef __HIP_PLATFORM_AMD__
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
-#endif
+#endif // __HIP_PLATFORM_AMD__
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
@@ -68,25 +68,52 @@
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
-#endif
+#endif // defined(GGML_USE_HIPBLAS)

 #include "ggml-cuda.h"
 #include "ggml.h"

-#define MIN_CC_DP4A
-#
-#define
-#
+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define CC_TURING 700
+#define CC_OFFSET_AMD 1000000
+#define CC_RDNA2 CC_OFFSET_AMD + 1030

 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300

+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+    defined(__gfx1150__) || defined(__gfx1151__)
+#define RDNA3
+#endif
+
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+#define RDNA2
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
     const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
     const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
     return reinterpret_cast<const int&>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int&>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
 }

 static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
@@ -115,7 +142,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 #endif
     return c;
 }
-#endif
+#endif // defined(GGML_USE_HIPBLAS)

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -127,8 +154,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do { \
         cudaError_t err_ = (err); \
         if (err_ != cudaSuccess) { \
-
+            int id; \
+            cudaGetDevice(&id); \
+            fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
                 cudaGetErrorString(err_)); \
+            fprintf(stderr, "current device: %d\n", id); \
             exit(1); \
         } \
     } while (0)
@@ -138,8 +168,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do { \
         cublasStatus_t err_ = (err); \
         if (err_ != CUBLAS_STATUS_SUCCESS) { \
+            int id; \
+            cudaGetDevice(&id); \
             fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
                 err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
+            fprintf(stderr, "current device: %d\n", id); \
             exit(1); \
         } \
     } while (0)
@@ -148,7 +181,10 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do { \
         cublasStatus_t err_ = (err); \
         if (err_ != CUBLAS_STATUS_SUCCESS) { \
+            int id; \
+            cudaGetDevice(&id); \
             fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
+            fprintf(stderr, "current device: %d\n", id); \
             exit(1); \
         } \
     } while (0)
@@ -195,10 +231,13 @@ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__
 typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
-typedef void (*
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
-
-    cudaStream_t &
+typedef void (*ggml_cuda_op_mul_mat_t)(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, const cudaStream_t & stream);
+typedef void (*ggml_cuda_op_flatten_t)(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream);

 // QK = number of values after dequantization
 // QR = QK / number of values before dequantization
@@ -379,11 +418,29 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif

+#define MUL_MAT_SRC1_COL_STRIDE 128
+
+#define MAX_STREAMS 8
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+
 struct ggml_tensor_extra_gpu {
     void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
 };

+// this is faster on Windows
+// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
+inline cudaError_t ggml_cuda_set_device(const int device) {
+    int current_device;
+    CUDA_CHECK(cudaGetDevice(&current_device));
+
+    if (device == current_device) {
+        return cudaSuccess;
+    }
+
+    return cudaSetDevice(device);
+}
+
 static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
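Note: the extra MAX_STREAMS dimension on `events` supports the usual CUDA pattern of ordering work across streams and devices with events instead of full device synchronization. A minimal sketch of that pattern follows; the function name and the fixed two-device setup are illustrative assumptions, not code from this diff.

    // Sketch: make work later queued on stream1 (device 1) wait for work already
    // queued on stream0 (device 0), using an event instead of cudaDeviceSynchronize().
    #include <cuda_runtime.h>

    void wait_for_peer(cudaStream_t stream0, cudaStream_t stream1) {
        cudaEvent_t done;
        cudaSetDevice(0);
        cudaEventCreateWithFlags(&done, cudaEventDisableTiming); // timing disabled: cheaper, sync-only event
        cudaEventRecord(done, stream0);          // marks the point stream0 must reach

        cudaSetDevice(1);
        cudaStreamWaitEvent(stream1, done, 0);   // work queued on stream1 after this call waits for the event

        cudaSetDevice(0);
        cudaEventDestroy(done);                  // safe: the pending wait captured the event state
    }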
@@ -396,8 +453,6 @@ static size_t g_scratch_offset = 0;

 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -447,58 +502,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }

+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;

     const float eps = 1e-5f;

-
-    float var = 0.0f;
+    float2 mean_var = make_float2(0.f, 0.f);

-    for (int col = tid; col < ncols; col +=
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
-
-
+        mean_var.x += xi;
+        mean_var.y += xi * xi;
     }

     // sum up partial sums
-
-
-
-
+    mean_var = warp_reduce_sum(mean_var);
+    if (block_size > WARP_SIZE) {
+        __shared__ float2 s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        __syncthreads();
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var);
     }

-mean
-var =
-const float
+    const float mean = mean_var.x / ncols;
+    const float var = mean_var.y / ncols - mean * mean;
+    const float inv_std = rsqrtf(var + eps);

-    for (int col = tid; col < ncols; col +=
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) *
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+    }
+}
+
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
     }
+    return x;
 }

+template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;

     float tmp = 0.0f; // partial sum for thread in warp

-    for (int col = tid; col < ncols; col +=
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }

     // sum up partial sums
-
-
-
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
     }

     const float mean = tmp / ncols;
     const float scale = rsqrtf(mean + eps);

-    for (int col = tid; col < ncols; col +=
+    for (int col = tid; col < ncols; col += block_size) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
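Note: the reworked norm_f32/rms_norm_f32 kernels use a two-level sum reduction: a butterfly shuffle within each warp, then a second pass over per-warp partials staged in shared memory whenever the block is larger than one warp. A standalone sketch of that same pattern is below; the kernel name `block_sum_f32` and the bounds guard on the second pass are illustrative additions, not code from this diff.

    #define WARP_SIZE 32

    static __device__ __forceinline__ float warp_reduce_sum(float x) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, mask, 32); // butterfly exchange within the warp
        }
        return x;
    }

    // Sums n floats with a single block; block_size must be a multiple of 32 and at most 1024.
    template <int block_size>
    static __global__ void block_sum_f32(const float * x, float * out, const int n) {
        float sum = 0.0f;
        for (int i = threadIdx.x; i < n; i += block_size) {
            sum += x[i];
        }
        sum = warp_reduce_sum(sum);                    // level 1: reduce within each warp
        if (block_size > WARP_SIZE) {
            __shared__ float s_sum[32];                // one partial sum per warp
            const int warp_id = threadIdx.x / WARP_SIZE;
            const int lane_id = threadIdx.x % WARP_SIZE;
            if (lane_id == 0) {
                s_sum[warp_id] = sum;
            }
            __syncthreads();
            sum = lane_id < block_size/WARP_SIZE ? s_sum[lane_id] : 0.0f;
            sum = warp_reduce_sum(sum);                // level 2: reduce across warps
        }
        if (threadIdx.x == 0) {
            *out = sum;
        }
    }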
@@ -3394,6 +3482,12 @@ static __device__ __forceinline__ void mul_mat_q(
     }
 }

+#define MMQ_X_Q4_0_RDNA2 64
+#define MMQ_Y_Q4_0_RDNA2 128
+#define NWARPS_Q4_0_RDNA2 8
+#define MMQ_X_Q4_0_RDNA1 64
+#define MMQ_Y_Q4_0_RDNA1 64
+#define NWARPS_Q4_0_RDNA1 8
 #define MMQ_X_Q4_0_AMPERE 64
 #define MMQ_Y_Q4_0_AMPERE 128
 #define NWARPS_Q4_0_AMPERE 4
@@ -3401,11 +3495,32 @@ static __device__ __forceinline__ void mul_mat_q(
 #define MMQ_Y_Q4_0_PASCAL 64
 #define NWARPS_Q4_0_PASCAL 8

-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q4_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q4_0_RDNA2;
+    const int mmq_y = MMQ_Y_Q4_0_RDNA2;
+    const int nwarps = NWARPS_Q4_0_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q4_0_RDNA1;
+    const int mmq_y = MMQ_Y_Q4_0_RDNA1;
+    const int nwarps = NWARPS_Q4_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q4_0_AMPERE;
     const int mmq_y = MMQ_Y_Q4_0_AMPERE;
     const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3428,6 +3543,12 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q4_1_RDNA2 64
+#define MMQ_Y_Q4_1_RDNA2 128
+#define NWARPS_Q4_1_RDNA2 8
+#define MMQ_X_Q4_1_RDNA1 64
+#define MMQ_Y_Q4_1_RDNA1 64
+#define NWARPS_Q4_1_RDNA1 8
 #define MMQ_X_Q4_1_AMPERE 64
 #define MMQ_Y_Q4_1_AMPERE 128
 #define NWARPS_Q4_1_AMPERE 4
@@ -3436,14 +3557,33 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
 #define NWARPS_Q4_1_PASCAL 8

 template <bool need_check> static __global__ void
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
     mul_mat_q4_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q4_1_RDNA2;
+    const int mmq_y = MMQ_Y_Q4_1_RDNA2;
+    const int nwarps = NWARPS_Q4_1_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q4_1_RDNA1;
+    const int mmq_y = MMQ_Y_Q4_1_RDNA1;
+    const int nwarps = NWARPS_Q4_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q4_1_AMPERE;
     const int mmq_y = MMQ_Y_Q4_1_AMPERE;
     const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3466,6 +3606,12 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q5_0_RDNA2 64
+#define MMQ_Y_Q5_0_RDNA2 128
+#define NWARPS_Q5_0_RDNA2 8
+#define MMQ_X_Q5_0_RDNA1 64
+#define MMQ_Y_Q5_0_RDNA1 64
+#define NWARPS_Q5_0_RDNA1 8
 #define MMQ_X_Q5_0_AMPERE 128
 #define MMQ_Y_Q5_0_AMPERE 64
 #define NWARPS_Q5_0_AMPERE 4
@@ -3473,11 +3619,32 @@ template <bool need_check> static __global__ void
 #define MMQ_Y_Q5_0_PASCAL 64
 #define NWARPS_Q5_0_PASCAL 8

-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q5_0_RDNA2;
+    const int mmq_y = MMQ_Y_Q5_0_RDNA2;
+    const int nwarps = NWARPS_Q5_0_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q5_0_RDNA1;
+    const int mmq_y = MMQ_Y_Q5_0_RDNA1;
+    const int nwarps = NWARPS_Q5_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q5_0_AMPERE;
     const int mmq_y = MMQ_Y_Q5_0_AMPERE;
     const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3500,6 +3667,12 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q5_1_RDNA2 64
+#define MMQ_Y_Q5_1_RDNA2 128
+#define NWARPS_Q5_1_RDNA2 8
+#define MMQ_X_Q5_1_RDNA1 64
+#define MMQ_Y_Q5_1_RDNA1 64
+#define NWARPS_Q5_1_RDNA1 8
 #define MMQ_X_Q5_1_AMPERE 128
 #define MMQ_Y_Q5_1_AMPERE 64
 #define NWARPS_Q5_1_AMPERE 4
@@ -3507,11 +3680,32 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
 #define MMQ_Y_Q5_1_PASCAL 64
 #define NWARPS_Q5_1_PASCAL 8

-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q5_1_RDNA2;
+    const int mmq_y = MMQ_Y_Q5_1_RDNA2;
+    const int nwarps = NWARPS_Q5_1_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q5_1_RDNA1;
+    const int mmq_y = MMQ_Y_Q5_1_RDNA1;
+    const int nwarps = NWARPS_Q5_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q5_1_AMPERE;
     const int mmq_y = MMQ_Y_Q5_1_AMPERE;
     const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3534,6 +3728,12 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q8_0_RDNA2 64
+#define MMQ_Y_Q8_0_RDNA2 128
+#define NWARPS_Q8_0_RDNA2 8
+#define MMQ_X_Q8_0_RDNA1 64
+#define MMQ_Y_Q8_0_RDNA1 64
+#define NWARPS_Q8_0_RDNA1 8
 #define MMQ_X_Q8_0_AMPERE 128
 #define MMQ_Y_Q8_0_AMPERE 64
 #define NWARPS_Q8_0_AMPERE 4
@@ -3541,11 +3741,32 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
 #define MMQ_Y_Q8_0_PASCAL 64
 #define NWARPS_Q8_0_PASCAL 8

-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q8_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q8_0_RDNA2;
+    const int mmq_y = MMQ_Y_Q8_0_RDNA2;
+    const int nwarps = NWARPS_Q8_0_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q8_0_RDNA1;
+    const int mmq_y = MMQ_Y_Q8_0_RDNA1;
+    const int nwarps = NWARPS_Q8_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q8_0_AMPERE;
     const int mmq_y = MMQ_Y_Q8_0_AMPERE;
     const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3568,6 +3789,12 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q2_K_RDNA2 64
+#define MMQ_Y_Q2_K_RDNA2 128
+#define NWARPS_Q2_K_RDNA2 8
+#define MMQ_X_Q2_K_RDNA1 128
+#define MMQ_Y_Q2_K_RDNA1 32
+#define NWARPS_Q2_K_RDNA1 8
 #define MMQ_X_Q2_K_AMPERE 64
 #define MMQ_Y_Q2_K_AMPERE 128
 #define NWARPS_Q2_K_AMPERE 4
@@ -3575,11 +3802,32 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
 #define MMQ_Y_Q2_K_PASCAL 64
 #define NWARPS_Q2_K_PASCAL 8

-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q2_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q2_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q2_K_RDNA2;
+    const int nwarps = NWARPS_Q2_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q2_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q2_K_RDNA1;
+    const int nwarps = NWARPS_Q2_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q2_K_AMPERE;
     const int mmq_y = MMQ_Y_Q2_K_AMPERE;
     const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3602,6 +3850,12 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q3_K_RDNA2 128
+#define MMQ_Y_Q3_K_RDNA2 64
+#define NWARPS_Q3_K_RDNA2 8
+#define MMQ_X_Q3_K_RDNA1 32
+#define MMQ_Y_Q3_K_RDNA1 128
+#define NWARPS_Q3_K_RDNA1 8
 #define MMQ_X_Q3_K_AMPERE 128
 #define MMQ_Y_Q3_K_AMPERE 128
 #define NWARPS_Q3_K_AMPERE 4
@@ -3610,14 +3864,33 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
 #define NWARPS_Q3_K_PASCAL 8

 template <bool need_check> static __global__ void
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
     __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
     mul_mat_q3_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q3_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q3_K_RDNA2;
+    const int nwarps = NWARPS_Q3_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q3_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q3_K_RDNA1;
+    const int nwarps = NWARPS_Q3_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q3_K_AMPERE;
     const int mmq_y = MMQ_Y_Q3_K_AMPERE;
     const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3640,6 +3913,12 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q4_K_RDNA2 64
+#define MMQ_Y_Q4_K_RDNA2 128
+#define NWARPS_Q4_K_RDNA2 8
+#define MMQ_X_Q4_K_RDNA1 32
+#define MMQ_Y_Q4_K_RDNA1 64
+#define NWARPS_Q4_K_RDNA1 8
 #define MMQ_X_Q4_K_AMPERE 64
 #define MMQ_Y_Q4_K_AMPERE 128
 #define NWARPS_Q4_K_AMPERE 4
@@ -3648,14 +3927,33 @@ template <bool need_check> static __global__ void
 #define NWARPS_Q4_K_PASCAL 8

 template <bool need_check> static __global__ void
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
     mul_mat_q4_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q4_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q4_K_RDNA2;
+    const int nwarps = NWARPS_Q4_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q4_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q4_K_RDNA1;
+    const int nwarps = NWARPS_Q4_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q4_K_AMPERE;
     const int mmq_y = MMQ_Y_Q4_K_AMPERE;
     const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -3678,6 +3976,12 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q5_K_RDNA2 64
+#define MMQ_Y_Q5_K_RDNA2 128
+#define NWARPS_Q5_K_RDNA2 8
+#define MMQ_X_Q5_K_RDNA1 32
+#define MMQ_Y_Q5_K_RDNA1 64
+#define NWARPS_Q5_K_RDNA1 8
 #define MMQ_X_Q5_K_AMPERE 64
 #define MMQ_Y_Q5_K_AMPERE 128
 #define NWARPS_Q5_K_AMPERE 4
@@ -3685,11 +3989,32 @@ template <bool need_check> static __global__ void
 #define MMQ_Y_Q5_K_PASCAL 64
 #define NWARPS_Q5_K_PASCAL 8

-template <bool need_check> static __global__ void
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q5_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q5_K_RDNA2;
+    const int nwarps = NWARPS_Q5_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q5_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q5_K_RDNA1;
+    const int nwarps = NWARPS_Q5_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q5_K_AMPERE;
     const int mmq_y = MMQ_Y_Q5_K_AMPERE;
     const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -3712,6 +4037,12 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q6_K_RDNA2 64
+#define MMQ_Y_Q6_K_RDNA2 128
+#define NWARPS_Q6_K_RDNA2 8
+#define MMQ_X_Q6_K_RDNA1 32
+#define MMQ_Y_Q6_K_RDNA1 64
+#define NWARPS_Q6_K_RDNA1 8
 #define MMQ_X_Q6_K_AMPERE 64
 #define MMQ_Y_Q6_K_AMPERE 64
 #define NWARPS_Q6_K_AMPERE 4
@@ -3720,14 +4051,33 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
 #define NWARPS_Q6_K_PASCAL 8

 template <bool need_check> static __global__ void
-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
     __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
     mul_mat_q6_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q6_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q6_K_RDNA2;
+    const int nwarps = NWARPS_Q6_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q6_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q6_K_RDNA1;
+    const int nwarps = NWARPS_Q6_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q6_K_AMPERE;
     const int mmq_y = MMQ_Y_Q6_K_AMPERE;
     const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4036,7 +4386,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
     dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }

-static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
+    const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;

@@ -4048,8 +4399,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     const int i = row*ncols + col;

     const float col_theta_scale = powf(theta_scale, col);
+    const float p = p0 + p_delta*(row/p_delta_rows);

-    const float theta = p*col_theta_scale;
+    const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);

@@ -4059,7 +4411,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     dst[i + 0] = x0*cos_theta - x1*sin_theta;
     dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

-    const float block_theta =
+    const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
     const float sin_block_theta = sinf(block_theta);
     const float cos_block_theta = cosf(block_theta);

@@ -4186,14 +4538,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_

 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-
-
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    }
 }

 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-
-
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
 }

 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
@@ -4498,7 +4860,15 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q4_0_RDNA2;
+        mmq_y = MMQ_Y_Q4_0_RDNA2;
+        nwarps = NWARPS_Q4_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q4_0_RDNA1;
+        mmq_y = MMQ_Y_Q4_0_RDNA1;
+        nwarps = NWARPS_Q4_0_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q4_0_AMPERE;
         mmq_y = MMQ_Y_Q4_0_AMPERE;
         nwarps = NWARPS_Q4_0_AMPERE;
@@ -4535,7 +4905,15 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q4_1_RDNA2;
+        mmq_y = MMQ_Y_Q4_1_RDNA2;
+        nwarps = NWARPS_Q4_1_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q4_1_RDNA1;
+        mmq_y = MMQ_Y_Q4_1_RDNA1;
+        nwarps = NWARPS_Q4_1_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q4_1_AMPERE;
         mmq_y = MMQ_Y_Q4_1_AMPERE;
         nwarps = NWARPS_Q4_1_AMPERE;
@@ -4572,7 +4950,15 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q5_0_RDNA2;
+        mmq_y = MMQ_Y_Q5_0_RDNA2;
+        nwarps = NWARPS_Q5_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q5_0_RDNA1;
+        mmq_y = MMQ_Y_Q5_0_RDNA1;
+        nwarps = NWARPS_Q5_0_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q5_0_AMPERE;
         mmq_y = MMQ_Y_Q5_0_AMPERE;
         nwarps = NWARPS_Q5_0_AMPERE;
@@ -4609,7 +4995,15 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q5_1_RDNA2;
+        mmq_y = MMQ_Y_Q5_1_RDNA2;
+        nwarps = NWARPS_Q5_1_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q5_1_RDNA1;
+        mmq_y = MMQ_Y_Q5_1_RDNA1;
+        nwarps = NWARPS_Q5_1_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q5_1_AMPERE;
         mmq_y = MMQ_Y_Q5_1_AMPERE;
         nwarps = NWARPS_Q5_1_AMPERE;
@@ -4646,7 +5040,15 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q8_0_RDNA2;
+        mmq_y = MMQ_Y_Q8_0_RDNA2;
+        nwarps = NWARPS_Q8_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q8_0_RDNA1;
+        mmq_y = MMQ_Y_Q8_0_RDNA1;
+        nwarps = NWARPS_Q8_0_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q8_0_AMPERE;
         mmq_y = MMQ_Y_Q8_0_AMPERE;
         nwarps = NWARPS_Q8_0_AMPERE;
@@ -4683,7 +5085,15 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q2_K_RDNA2;
+        mmq_y = MMQ_Y_Q2_K_RDNA2;
+        nwarps = NWARPS_Q2_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q2_K_RDNA1;
+        mmq_y = MMQ_Y_Q2_K_RDNA1;
+        nwarps = NWARPS_Q2_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q2_K_AMPERE;
         mmq_y = MMQ_Y_Q2_K_AMPERE;
         nwarps = NWARPS_Q2_K_AMPERE;
@@ -4722,7 +5132,15 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q3_K_RDNA2;
+        mmq_y = MMQ_Y_Q3_K_RDNA2;
+        nwarps = NWARPS_Q3_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q3_K_RDNA1;
+        mmq_y = MMQ_Y_Q3_K_RDNA1;
+        nwarps = NWARPS_Q3_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q3_K_AMPERE;
         mmq_y = MMQ_Y_Q3_K_AMPERE;
         nwarps = NWARPS_Q3_K_AMPERE;
@@ -4760,7 +5178,15 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q4_K_RDNA2;
+        mmq_y = MMQ_Y_Q4_K_RDNA2;
+        nwarps = NWARPS_Q4_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q4_K_RDNA1;
+        mmq_y = MMQ_Y_Q4_K_RDNA1;
+        nwarps = NWARPS_Q4_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q4_K_AMPERE;
         mmq_y = MMQ_Y_Q4_K_AMPERE;
         nwarps = NWARPS_Q4_K_AMPERE;
@@ -4797,7 +5223,15 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q5_K_RDNA2;
+        mmq_y = MMQ_Y_Q5_K_RDNA2;
+        nwarps = NWARPS_Q5_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q5_K_RDNA1;
+        mmq_y = MMQ_Y_Q5_K_RDNA1;
+        nwarps = NWARPS_Q5_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q5_K_AMPERE;
         mmq_y = MMQ_Y_Q5_K_AMPERE;
         nwarps = NWARPS_Q5_K_AMPERE;
@@ -4834,7 +5268,15 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >=
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q6_K_RDNA2;
+        mmq_y = MMQ_Y_Q6_K_RDNA2;
+        nwarps = NWARPS_Q6_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q6_K_RDNA1;
+        mmq_y = MMQ_Y_Q6_K_RDNA1;
+        nwarps = NWARPS_Q6_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q6_K_AMPERE;
         mmq_y = MMQ_Y_Q6_K_AMPERE;
         nwarps = NWARPS_Q6_K_AMPERE;
@@ -4924,12 +5366,13 @@ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, co
     rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }

-static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float
-
-
-    const
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+    const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
+    GGML_ASSERT(ncols % 4 == 0);
+    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
+    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols,
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
 }

 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5067,25 +5510,30 @@ void ggml_init_cublas() {
     GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
     int64_t total_vram = 0;
     fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-    for (
+    for (int64_t id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %
+        fprintf(stderr, "  Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);

         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
-
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
+#else
         g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     }
-    for (
+    for (int64_t id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
     }

-    for (
-        CUDA_CHECK(
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));

-        // create
-
+        // create cuda streams
+        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
+        }

         // create cublas handle
         CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
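Note: adding CC_OFFSET_AMD into g_compute_capabilities lets the same integer thresholds pick tile configurations for both vendors: every AMD device lands at or above CC_OFFSET_AMD (1000000), RDNA2-class devices (e.g. gfx1030, reported as 10.3) land at or above CC_RDNA2 (1001030), and NVIDIA values stay in the familiar 610/700 range. A hedged sketch of that dispatch follows, using the q4_0 tile values visible in this diff; the helper name and the Pascal MMQ_X value are illustrative assumptions.

    #define CC_TURING     700
    #define CC_OFFSET_AMD 1000000
    #define CC_RDNA2      (CC_OFFSET_AMD + 1030)

    struct tile_config { int mmq_x, mmq_y, nwarps; };

    // pick_q4_0_tiles() is an illustrative name, not a function in ggml-cuda.cu.
    static tile_config pick_q4_0_tiles(int compute_capability) {
        if (compute_capability >= CC_RDNA2) {            // gfx1030 -> 100*10 + 10*3 + 1000000 = 1001030
            return {64, 128, 8};                         // MMQ_X/Y/NWARPS_Q4_0_RDNA2
        } else if (compute_capability >= CC_OFFSET_AMD) {
            return {64, 64, 8};                          // older AMD parts (RDNA1 values)
        } else if (compute_capability >= CC_TURING) {
            return {64, 128, 4};                         // MMQ_*_Q4_0_AMPERE
        }
        return {64, 64, 8};                              // Pascal fallback; Y/NWARPS from the diff, X assumed 64
    }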
@@ -5154,7 +5602,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     if (src->backend == GGML_BACKEND_CPU) {
         kind = cudaMemcpyHostToDevice;
         src_ptr = (char *) src->data;
-    } else if (src->backend == GGML_BACKEND_GPU) {
+    } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
+        GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = cudaMemcpyDeviceToDevice;
         struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
@@ -5193,236 +5642,205 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     }
 }

 inline void ggml_cuda_op_add(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    float *
-    cudaStream_t & cudaStream_main){
-
-    GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
-    GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i != nullptr);
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

-
-    const int64_t i01_diff = i01_high - i01_low;
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);

     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];

-    // compute
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        add_f32_cuda(
+        add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        add_f16_f32_f16_cuda((half *)
+        add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
     } else {
         GGML_ASSERT(false);
     }

     (void) src1;
     (void) dst;
-    (void) src0_ddq_i;
-    (void) i02;
-    (void) i1;
 }

 inline void ggml_cuda_op_mul(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    float *
-    cudaStream_t & cudaStream_main){
-
-    GGML_ASSERT(src0_ddf_i != nullptr);
-    GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i != nullptr);
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

-
-
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);

     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];

-    mul_f32_cuda(
+    mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);

     (void) dst;
-    (void) src0_ddq_i;
-    (void) i02;
-    (void) i1;
 }

 inline void ggml_cuda_op_gelu(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    float *
-    cudaStream_t & cudaStream_main){
-
-    GGML_ASSERT(src0_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i != nullptr);
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

-
-
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);

-
-    gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+    gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

     (void) src1;
     (void) dst;
-    (void)
-    (void) src1_ddf_i;
-    (void) i02;
-    (void) i1;
+    (void) src1_dd;
 }

 inline void ggml_cuda_op_silu(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    float *
-    cudaStream_t & cudaStream_main){
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);

-
-    const int64_t i01_diff = i01_high - i01_low;
-
-    // compute
-    silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+    silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

     (void) src1;
     (void) dst;
-    (void)
-    (void) src1_ddf_i;
-    (void) i02;
-    (void) i1;
+    (void) src1_dd;
 }

 inline void ggml_cuda_op_norm(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    float *
-    cudaStream_t & cudaStream_main){
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);

     const int64_t ne00 = src0->ne[0];
-    const int64_t
+    const int64_t nrows = ggml_nrows(src0);

-
-    norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);

     (void) src1;
     (void) dst;
-    (void)
-    (void) src1_ddf_i;
-    (void) i02;
-    (void) i1;
+    (void) src1_dd;
 }

 inline void ggml_cuda_op_rms_norm(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    float *
-    cudaStream_t & cudaStream_main){
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);

     const int64_t ne00 = src0->ne[0];
-    const int64_t
+    const int64_t nrows = ggml_nrows(src0);

     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));

-
-    rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
+    rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);

     (void) src1;
     (void) dst;
-    (void)
-    (void) src1_ddf_i;
-    (void) i02;
-    (void) i1;
+    (void) src1_dd;
 }

 inline void ggml_cuda_op_mul_mat_q(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
-
-    cudaStream_t &
-
-    GGML_ASSERT(src0_ddq_i != nullptr);
-    GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_ddf_i != nullptr);
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, const cudaStream_t & stream) {

     const int64_t ne00 = src0->ne[0];

     const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
     GGML_ASSERT(ne10 % QK8_1 == 0);

     const int64_t ne0 = dst->ne[0];

-    const int64_t
+    const int64_t row_diff = row_high - row_low;

     int id;
     CUDA_CHECK(cudaGetDevice(&id));

     // the main device has a larger memory buffer to hold the results from all GPUs
     // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
-    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 :
-
-    const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
-        ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
-    size_t as;
-    void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
-    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
+    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;

     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            ggml_mul_mat_q4_0_q8_1_cuda(
+            ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q4_1:
-            ggml_mul_mat_q4_1_q8_1_cuda(
+            ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
            break;
         case GGML_TYPE_Q5_0:
-            ggml_mul_mat_q5_0_q8_1_cuda(
+            ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5381
5776
|
break;
|
5382
5777
|
case GGML_TYPE_Q5_1:
|
5383
|
-
ggml_mul_mat_q5_1_q8_1_cuda(
|
5778
|
+
ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5384
5779
|
break;
|
5385
5780
|
case GGML_TYPE_Q8_0:
|
5386
|
-
ggml_mul_mat_q8_0_q8_1_cuda(
|
5781
|
+
ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5387
5782
|
break;
|
5388
5783
|
case GGML_TYPE_Q2_K:
|
5389
|
-
ggml_mul_mat_q2_K_q8_1_cuda(
|
5784
|
+
ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5390
5785
|
break;
|
5391
5786
|
case GGML_TYPE_Q3_K:
|
5392
|
-
ggml_mul_mat_q3_K_q8_1_cuda(
|
5787
|
+
ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5393
5788
|
break;
|
5394
5789
|
case GGML_TYPE_Q4_K:
|
5395
|
-
ggml_mul_mat_q4_K_q8_1_cuda(
|
5790
|
+
ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5396
5791
|
break;
|
5397
5792
|
case GGML_TYPE_Q5_K:
|
5398
|
-
ggml_mul_mat_q5_K_q8_1_cuda(
|
5793
|
+
ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5399
5794
|
break;
|
5400
5795
|
case GGML_TYPE_Q6_K:
|
5401
|
-
ggml_mul_mat_q6_K_q8_1_cuda(
|
5796
|
+
ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5402
5797
|
break;
|
5403
5798
|
default:
|
5404
5799
|
GGML_ASSERT(false);
|
5405
5800
|
break;
|
5406
5801
|
}
|
5407
5802
|
|
5408
|
-
ggml_cuda_pool_free(src1_q8_1, as);
|
5409
|
-
|
5410
5803
|
(void) src1;
|
5411
5804
|
(void) dst;
|
5412
|
-
(void)
|
5413
|
-
(void) i02;
|
5414
|
-
(void) i1;
|
5805
|
+
(void) src1_ddf_i;
|
5415
5806
|
}
|
5416
5807
|
|
5417
5808
|
static int64_t get_row_rounding(ggml_type type) {
|
5418
|
-
|
5419
|
-
|
5420
|
-
|
5421
|
-
|
5422
|
-
|
5809
|
+
int64_t min_compute_capability = INT_MAX;
|
5810
|
+
int64_t max_compute_capability = INT_MIN;
|
5811
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
5812
|
+
if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
5813
|
+
if (min_compute_capability > g_compute_capabilities[id]) {
|
5814
|
+
min_compute_capability = g_compute_capabilities[id];
|
5815
|
+
}
|
5816
|
+
if (max_compute_capability < g_compute_capabilities[id]) {
|
5817
|
+
max_compute_capability = g_compute_capabilities[id];
|
5818
|
+
}
|
5423
5819
|
}
|
5424
5820
|
}
|
5425
5821
|
|
5822
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
5823
|
+
switch(type) {
|
5824
|
+
case GGML_TYPE_Q4_0:
|
5825
|
+
case GGML_TYPE_Q4_1:
|
5826
|
+
case GGML_TYPE_Q5_0:
|
5827
|
+
case GGML_TYPE_Q5_1:
|
5828
|
+
case GGML_TYPE_Q8_0:
|
5829
|
+
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
5830
|
+
case GGML_TYPE_F16:
|
5831
|
+
return 1;
|
5832
|
+
case GGML_TYPE_Q2_K:
|
5833
|
+
return max_compute_capability >= CC_RDNA2 ? 128 : 32;
|
5834
|
+
case GGML_TYPE_Q3_K:
|
5835
|
+
return min_compute_capability < CC_RDNA2 ? 128 : 64;
|
5836
|
+
case GGML_TYPE_Q4_K:
|
5837
|
+
case GGML_TYPE_Q5_K:
|
5838
|
+
case GGML_TYPE_Q6_K:
|
5839
|
+
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
5840
|
+
default:
|
5841
|
+
GGML_ASSERT(false);
|
5842
|
+
}
|
5843
|
+
#else
|
5426
5844
|
switch(type) {
|
5427
5845
|
case GGML_TYPE_Q4_0:
|
5428
5846
|
case GGML_TYPE_Q4_1:
|
@@ -5443,170 +5861,147 @@ static int64_t get_row_rounding(ggml_type type) {
|
|
5443
5861
|
default:
|
5444
5862
|
GGML_ASSERT(false);
|
5445
5863
|
}
|
5864
|
+
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
5446
5865
|
}
|
5447
5866
|
|
5448
|
-
inline void
|
5449
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
|
5450
|
-
|
5451
|
-
cudaStream_t &
|
5452
|
-
|
5453
|
-
GGML_ASSERT(src0_ddq_i != nullptr);
|
5454
|
-
GGML_ASSERT(src1_ddf_i != nullptr);
|
5455
|
-
GGML_ASSERT(dst_ddf_i != nullptr);
|
5867
|
+
inline void ggml_cuda_op_mul_mat_vec_q(
|
5868
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
5869
|
+
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
5870
|
+
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
5456
5871
|
|
5457
5872
|
const int64_t ne00 = src0->ne[0];
|
5458
|
-
const int64_t
|
5873
|
+
const int64_t row_diff = row_high - row_low;
|
5459
5874
|
|
5460
|
-
|
5461
|
-
|
5462
|
-
|
5463
|
-
|
5464
|
-
|
5465
|
-
|
5875
|
+
switch (src0->type) {
|
5876
|
+
case GGML_TYPE_Q4_0:
|
5877
|
+
mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5878
|
+
break;
|
5879
|
+
case GGML_TYPE_Q4_1:
|
5880
|
+
mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5881
|
+
break;
|
5882
|
+
case GGML_TYPE_Q5_0:
|
5883
|
+
mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5884
|
+
break;
|
5885
|
+
case GGML_TYPE_Q5_1:
|
5886
|
+
mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5887
|
+
break;
|
5888
|
+
case GGML_TYPE_Q8_0:
|
5889
|
+
mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5890
|
+
break;
|
5891
|
+
case GGML_TYPE_Q2_K:
|
5892
|
+
mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5893
|
+
break;
|
5894
|
+
case GGML_TYPE_Q3_K:
|
5895
|
+
mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5896
|
+
break;
|
5897
|
+
case GGML_TYPE_Q4_K:
|
5898
|
+
mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5899
|
+
break;
|
5900
|
+
case GGML_TYPE_Q5_K:
|
5901
|
+
mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5902
|
+
break;
|
5903
|
+
case GGML_TYPE_Q6_K:
|
5904
|
+
mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5905
|
+
break;
|
5906
|
+
default:
|
5907
|
+
GGML_ASSERT(false);
|
5908
|
+
break;
|
5909
|
+
}
|
5466
5910
|
|
5467
|
-
|
5468
|
-
|
5469
|
-
|
5470
|
-
|
5471
|
-
|
5472
|
-
|
5473
|
-
#if QK_K == 256
|
5474
|
-
mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
|
5475
|
-
src0->type == GGML_TYPE_Q2_K ||
|
5476
|
-
src0->type == GGML_TYPE_Q3_K ||
|
5477
|
-
src0->type == GGML_TYPE_Q4_K ||
|
5478
|
-
src0->type == GGML_TYPE_Q5_K ||
|
5479
|
-
src0->type == GGML_TYPE_Q6_K;
|
5480
|
-
#endif // QK_K == 256
|
5481
|
-
|
5482
|
-
const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
|
5483
|
-
#endif
|
5911
|
+
(void) src1;
|
5912
|
+
(void) dst;
|
5913
|
+
(void) src1_ddf_i;
|
5914
|
+
(void) src1_ncols;
|
5915
|
+
(void) src1_padded_row_size;
|
5916
|
+
}
|
5484
5917
|
|
5485
|
-
|
5486
|
-
|
5487
|
-
|
5488
|
-
|
5489
|
-
void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
|
5490
|
-
quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
|
5491
|
-
|
5492
|
-
switch (src0->type) {
|
5493
|
-
case GGML_TYPE_Q4_0:
|
5494
|
-
mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5495
|
-
break;
|
5496
|
-
case GGML_TYPE_Q4_1:
|
5497
|
-
mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5498
|
-
break;
|
5499
|
-
case GGML_TYPE_Q5_0:
|
5500
|
-
mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5501
|
-
break;
|
5502
|
-
case GGML_TYPE_Q5_1:
|
5503
|
-
mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5504
|
-
break;
|
5505
|
-
case GGML_TYPE_Q8_0:
|
5506
|
-
mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5507
|
-
break;
|
5508
|
-
case GGML_TYPE_Q2_K:
|
5509
|
-
mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5510
|
-
break;
|
5511
|
-
case GGML_TYPE_Q3_K:
|
5512
|
-
mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5513
|
-
break;
|
5514
|
-
case GGML_TYPE_Q4_K:
|
5515
|
-
mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5516
|
-
break;
|
5517
|
-
case GGML_TYPE_Q5_K:
|
5518
|
-
mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5519
|
-
break;
|
5520
|
-
case GGML_TYPE_Q6_K:
|
5521
|
-
mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5522
|
-
break;
|
5523
|
-
default:
|
5524
|
-
GGML_ASSERT(false);
|
5525
|
-
break;
|
5526
|
-
}
|
5918
|
+
inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
5919
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
5920
|
+
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
5921
|
+
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
5527
5922
|
|
5528
|
-
|
5529
|
-
|
5530
|
-
|
5923
|
+
const int64_t ne00 = src0->ne[0];
|
5924
|
+
const int64_t row_diff = row_high - row_low;
|
5925
|
+
|
5926
|
+
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
5531
5927
|
#ifdef GGML_CUDA_F16
|
5532
|
-
|
5533
|
-
|
5534
|
-
|
5535
|
-
|
5536
|
-
|
5537
|
-
|
5538
|
-
|
5539
|
-
|
5540
|
-
|
5541
|
-
|
5542
|
-
|
5543
|
-
|
5544
|
-
|
5928
|
+
size_t ash;
|
5929
|
+
dfloat * src1_dfloat = nullptr; // dfloat == half
|
5930
|
+
|
5931
|
+
bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
5932
|
+
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
|
5933
|
+
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
5934
|
+
|
5935
|
+
if (src1_convert_f16) {
|
5936
|
+
src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
|
5937
|
+
ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
|
5938
|
+
ne00, 1, sizeof(float), 0, 0,
|
5939
|
+
ne00, 1, sizeof(half), 0, 0, stream);
|
5940
|
+
}
|
5545
5941
|
#else
|
5546
|
-
|
5942
|
+
const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
|
5547
5943
|
#endif // GGML_CUDA_F16
|
5548
5944
|
|
5549
|
-
|
5550
|
-
|
5551
|
-
|
5552
|
-
|
5553
|
-
|
5554
|
-
|
5555
|
-
|
5556
|
-
|
5557
|
-
|
5558
|
-
|
5559
|
-
|
5560
|
-
|
5561
|
-
|
5562
|
-
|
5563
|
-
|
5564
|
-
|
5565
|
-
|
5566
|
-
|
5567
|
-
|
5568
|
-
|
5569
|
-
|
5570
|
-
|
5571
|
-
|
5572
|
-
|
5573
|
-
|
5574
|
-
|
5575
|
-
|
5576
|
-
|
5577
|
-
|
5578
|
-
|
5579
|
-
|
5580
|
-
|
5581
|
-
|
5582
|
-
|
5583
|
-
|
5584
|
-
|
5585
|
-
|
5586
|
-
|
5945
|
+
switch (src0->type) {
|
5946
|
+
case GGML_TYPE_Q4_0:
|
5947
|
+
dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5948
|
+
break;
|
5949
|
+
case GGML_TYPE_Q4_1:
|
5950
|
+
dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5951
|
+
break;
|
5952
|
+
case GGML_TYPE_Q5_0:
|
5953
|
+
dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5954
|
+
break;
|
5955
|
+
case GGML_TYPE_Q5_1:
|
5956
|
+
dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5957
|
+
break;
|
5958
|
+
case GGML_TYPE_Q8_0:
|
5959
|
+
dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5960
|
+
break;
|
5961
|
+
case GGML_TYPE_Q2_K:
|
5962
|
+
dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5963
|
+
break;
|
5964
|
+
case GGML_TYPE_Q3_K:
|
5965
|
+
dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5966
|
+
break;
|
5967
|
+
case GGML_TYPE_Q4_K:
|
5968
|
+
dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5969
|
+
break;
|
5970
|
+
case GGML_TYPE_Q5_K:
|
5971
|
+
dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5972
|
+
break;
|
5973
|
+
case GGML_TYPE_Q6_K:
|
5974
|
+
dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5975
|
+
break;
|
5976
|
+
case GGML_TYPE_F16:
|
5977
|
+
convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5978
|
+
break;
|
5979
|
+
default:
|
5980
|
+
GGML_ASSERT(false);
|
5981
|
+
break;
|
5982
|
+
}
|
5587
5983
|
|
5588
5984
|
#ifdef GGML_CUDA_F16
|
5589
|
-
|
5590
|
-
|
5591
|
-
}
|
5592
|
-
#endif // GGML_CUDA_F16
|
5985
|
+
if (src1_convert_f16) {
|
5986
|
+
ggml_cuda_pool_free(src1_dfloat, ash);
|
5593
5987
|
}
|
5988
|
+
#endif // GGML_CUDA_F16
|
5594
5989
|
|
5595
5990
|
(void) src1;
|
5596
5991
|
(void) dst;
|
5597
|
-
(void)
|
5598
|
-
(void)
|
5599
|
-
(void)
|
5992
|
+
(void) src1_ddq_i;
|
5993
|
+
(void) src1_ncols;
|
5994
|
+
(void) src1_padded_row_size;
|
5600
5995
|
}
|
5601
5996
|
|
5602
5997
|
inline void ggml_cuda_op_mul_mat_cublas(
|
5603
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
|
5604
|
-
|
5605
|
-
cudaStream_t &
|
5998
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
5999
|
+
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
6000
|
+
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
5606
6001
|
|
5607
|
-
GGML_ASSERT(
|
6002
|
+
GGML_ASSERT(src0_dd_i != nullptr);
|
5608
6003
|
GGML_ASSERT(src1_ddf_i != nullptr);
|
5609
|
-
GGML_ASSERT(
|
6004
|
+
GGML_ASSERT(dst_dd_i != nullptr);
|
5610
6005
|
|
5611
6006
|
const float alpha = 1.0f;
|
5612
6007
|
const float beta = 0.0f;
|
@@ -5614,43 +6009,54 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|
5614
6009
|
const int64_t ne00 = src0->ne[0];
|
5615
6010
|
|
5616
6011
|
const int64_t ne10 = src1->ne[0];
|
5617
|
-
const int64_t ne11 = src1->ne[1];
|
5618
6012
|
|
5619
6013
|
const int64_t ne0 = dst->ne[0];
|
5620
|
-
const int64_t
|
6014
|
+
const int64_t row_diff = row_high - row_low;
|
6015
|
+
|
6016
|
+
float * src0_ddq_as_f32;
|
6017
|
+
size_t src0_as = 0;
|
6018
|
+
|
6019
|
+
if (src0->type != GGML_TYPE_F32) {
|
6020
|
+
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
6021
|
+
src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
|
6022
|
+
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
|
6023
|
+
}
|
6024
|
+
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
|
5621
6025
|
|
5622
6026
|
int id;
|
5623
6027
|
CUDA_CHECK(cudaGetDevice(&id));
|
5624
6028
|
|
5625
6029
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
5626
6030
|
// ldc == nrows of the matrix that cuBLAS writes into
|
5627
|
-
int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 :
|
6031
|
+
int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
|
5628
6032
|
|
5629
|
-
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id],
|
6033
|
+
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
|
5630
6034
|
CUBLAS_CHECK(
|
5631
6035
|
cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
5632
|
-
|
6036
|
+
row_diff, src1_ncols, ne10,
|
5633
6037
|
&alpha, src0_ddf_i, ne00,
|
5634
|
-
src1_ddf_i,
|
5635
|
-
&beta,
|
6038
|
+
src1_ddf_i, ne10,
|
6039
|
+
&beta, dst_dd_i, ldc));
|
6040
|
+
|
6041
|
+
if (src0_as > 0) {
|
6042
|
+
ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
|
6043
|
+
}
|
5636
6044
|
|
5637
6045
|
(void) dst;
|
5638
|
-
(void)
|
5639
|
-
(void)
|
5640
|
-
(void) i1;
|
6046
|
+
(void) src1_ddq_i;
|
6047
|
+
(void) src1_padded_row_size;
|
5641
6048
|
}
|
5642
6049
|
|
5643
6050
|
inline void ggml_cuda_op_rope(
|
5644
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5645
|
-
float *
|
5646
|
-
cudaStream_t & cudaStream_main){
|
6051
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6052
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5647
6053
|
|
5648
|
-
GGML_ASSERT(
|
5649
|
-
GGML_ASSERT(
|
6054
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6055
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5650
6056
|
|
5651
6057
|
const int64_t ne00 = src0->ne[0];
|
5652
6058
|
const int64_t ne01 = src0->ne[1];
|
5653
|
-
const int64_t
|
6059
|
+
const int64_t nrows = ggml_nrows(src0);
|
5654
6060
|
|
5655
6061
|
const int n_past = ((int32_t *) dst->op_params)[0];
|
5656
6062
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
@@ -5663,44 +6069,37 @@ inline void ggml_cuda_op_rope(
|
|
5663
6069
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
5664
6070
|
|
5665
6071
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
6072
|
+
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
5666
6073
|
|
5667
6074
|
const bool is_neox = mode & 2;
|
5668
6075
|
const bool is_glm = mode & 4;
|
5669
6076
|
|
5670
6077
|
// compute
|
5671
6078
|
if (is_glm) {
|
5672
|
-
|
5673
|
-
const float id_p = min(p, n_ctx - 2.f);
|
5674
|
-
const float block_p = max(p - (n_ctx - 2.f), 0.f);
|
5675
|
-
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
|
6079
|
+
rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream);
|
5676
6080
|
} else if (is_neox) {
|
5677
6081
|
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
|
5678
|
-
|
5679
|
-
rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
6082
|
+
rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
|
5680
6083
|
} else {
|
5681
|
-
|
5682
|
-
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
6084
|
+
rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
|
5683
6085
|
}
|
5684
6086
|
|
5685
6087
|
(void) src1;
|
5686
6088
|
(void) dst;
|
5687
|
-
(void)
|
5688
|
-
(void) src1_ddf_i;
|
5689
|
-
(void) i1;
|
6089
|
+
(void) src1_dd;
|
5690
6090
|
}
|
5691
6091
|
|
5692
6092
|
inline void ggml_cuda_op_alibi(
|
5693
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5694
|
-
float *
|
5695
|
-
cudaStream_t & cudaStream_main){
|
6093
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6094
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5696
6095
|
|
5697
|
-
GGML_ASSERT(
|
5698
|
-
GGML_ASSERT(
|
6096
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6097
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5699
6098
|
|
5700
6099
|
const int64_t ne00 = src0->ne[0];
|
5701
6100
|
const int64_t ne01 = src0->ne[1];
|
5702
6101
|
const int64_t ne02 = src0->ne[2];
|
5703
|
-
const int64_t
|
6102
|
+
const int64_t nrows = ggml_nrows(src0);
|
5704
6103
|
|
5705
6104
|
const int n_past = ((int32_t *) dst->op_params)[0];
|
5706
6105
|
const int n_head = ((int32_t *) dst->op_params)[1];
|
@@ -5715,334 +6114,354 @@ inline void ggml_cuda_op_alibi(
|
|
5715
6114
|
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
5716
6115
|
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
5717
6116
|
|
5718
|
-
|
5719
|
-
alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
|
6117
|
+
alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
|
5720
6118
|
|
5721
6119
|
(void) src1;
|
5722
|
-
(void)
|
5723
|
-
(void) src1_ddf_i;
|
5724
|
-
(void) i1;
|
6120
|
+
(void) src1_dd;
|
5725
6121
|
}
|
5726
6122
|
|
5727
6123
|
inline void ggml_cuda_op_diag_mask_inf(
|
5728
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5729
|
-
float *
|
5730
|
-
cudaStream_t & cudaStream_main){
|
6124
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6125
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5731
6126
|
|
5732
|
-
GGML_ASSERT(
|
5733
|
-
GGML_ASSERT(
|
6127
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6128
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5734
6129
|
|
5735
6130
|
const int64_t ne00 = src0->ne[0];
|
5736
6131
|
const int64_t ne01 = src0->ne[1];
|
5737
|
-
const
|
6132
|
+
const int nrows0 = ggml_nrows(src0);
|
5738
6133
|
|
5739
6134
|
const int n_past = ((int32_t *) dst->op_params)[0];
|
5740
6135
|
|
5741
|
-
|
5742
|
-
diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
|
6136
|
+
diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
|
5743
6137
|
|
5744
6138
|
(void) src1;
|
5745
6139
|
(void) dst;
|
5746
|
-
(void)
|
5747
|
-
(void) src1_ddf_i;
|
5748
|
-
(void) i02;
|
5749
|
-
(void) i1;
|
6140
|
+
(void) src1_dd;
|
5750
6141
|
}
|
5751
6142
|
|
5752
6143
|
inline void ggml_cuda_op_soft_max(
|
5753
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5754
|
-
float *
|
5755
|
-
cudaStream_t & cudaStream_main){
|
6144
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6145
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5756
6146
|
|
5757
|
-
GGML_ASSERT(
|
5758
|
-
GGML_ASSERT(
|
6147
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6148
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5759
6149
|
|
5760
6150
|
const int64_t ne00 = src0->ne[0];
|
5761
|
-
const int64_t
|
6151
|
+
const int64_t nrows = ggml_nrows(src0);
|
5762
6152
|
|
5763
|
-
|
5764
|
-
soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
|
6153
|
+
soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
|
5765
6154
|
|
5766
6155
|
(void) src1;
|
5767
6156
|
(void) dst;
|
5768
|
-
(void)
|
5769
|
-
(void) src1_ddf_i;
|
5770
|
-
(void) i02;
|
5771
|
-
(void) i1;
|
6157
|
+
(void) src1_dd;
|
5772
6158
|
}
|
5773
6159
|
|
5774
6160
|
inline void ggml_cuda_op_scale(
|
5775
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5776
|
-
float *
|
5777
|
-
cudaStream_t & cudaStream_main){
|
6161
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6162
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5778
6163
|
|
5779
|
-
GGML_ASSERT(
|
5780
|
-
GGML_ASSERT(
|
6164
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6165
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
6166
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5781
6167
|
|
5782
6168
|
const float scale = ((float *) src1->data)[0];
|
5783
6169
|
|
5784
|
-
|
5785
|
-
const int64_t i01_diff = i01_high - i01_low;
|
5786
|
-
|
5787
|
-
// compute
|
5788
|
-
scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
|
6170
|
+
scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
|
5789
6171
|
CUDA_CHECK(cudaGetLastError());
|
5790
6172
|
|
5791
6173
|
(void) src1;
|
5792
6174
|
(void) dst;
|
5793
|
-
(void)
|
5794
|
-
|
5795
|
-
|
5796
|
-
|
6175
|
+
(void) src1_dd;
|
6176
|
+
}
|
6177
|
+
|
6178
|
+
static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
|
6179
|
+
const int64_t nrows0 = ggml_nrows(src0);
|
6180
|
+
|
6181
|
+
const bool use_src1 = src1 != nullptr;
|
6182
|
+
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
|
6183
|
+
|
6184
|
+
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
|
6185
|
+
GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
|
6186
|
+
|
6187
|
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6188
|
+
struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
6189
|
+
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
6190
|
+
|
6191
|
+
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
6192
|
+
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
|
6193
|
+
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
|
6194
|
+
|
6195
|
+
const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
|
6196
|
+
|
6197
|
+
// dd = data device
|
6198
|
+
float * src0_ddf = nullptr;
|
6199
|
+
float * src1_ddf = nullptr;
|
6200
|
+
float * dst_ddf = nullptr;
|
6201
|
+
|
6202
|
+
// as = actual size
|
6203
|
+
size_t src0_asf = 0;
|
6204
|
+
size_t src1_asf = 0;
|
6205
|
+
size_t dst_asf = 0;
|
6206
|
+
|
6207
|
+
ggml_cuda_set_device(g_main_device);
|
6208
|
+
const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
6209
|
+
|
6210
|
+
if (src0_on_device) {
|
6211
|
+
src0_ddf = (float *) src0_extra->data_device[g_main_device];
|
6212
|
+
} else {
|
6213
|
+
src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
|
6214
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
|
6215
|
+
}
|
6216
|
+
|
6217
|
+
if (use_src1 && !src1_stays_on_host) {
|
6218
|
+
if (src1_on_device) {
|
6219
|
+
src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
6220
|
+
} else {
|
6221
|
+
src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
|
6222
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
|
6223
|
+
}
|
6224
|
+
}
|
6225
|
+
if (dst_on_device) {
|
6226
|
+
dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
6227
|
+
} else {
|
6228
|
+
dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
|
6229
|
+
}
|
6230
|
+
|
6231
|
+
// do the computation
|
6232
|
+
op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
|
6233
|
+
CUDA_CHECK(cudaGetLastError());
|
6234
|
+
|
6235
|
+
// copy dst to host if necessary
|
6236
|
+
if (!dst_on_device) {
|
6237
|
+
CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
|
6238
|
+
}
|
6239
|
+
|
6240
|
+
if (src0_asf > 0) {
|
6241
|
+
ggml_cuda_pool_free(src0_ddf, src0_asf);
|
6242
|
+
}
|
6243
|
+
if (src1_asf > 0) {
|
6244
|
+
ggml_cuda_pool_free(src1_ddf, src1_asf);
|
6245
|
+
}
|
6246
|
+
if (dst_asf > 0) {
|
6247
|
+
ggml_cuda_pool_free(dst_ddf, dst_asf);
|
6248
|
+
}
|
6249
|
+
|
6250
|
+
if (dst->backend == GGML_BACKEND_CPU) {
|
6251
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
6252
|
+
}
|
5797
6253
|
}
|
5798
6254
|
|
5799
|
-
static void
|
5800
|
-
|
6255
|
+
static void ggml_cuda_op_mul_mat(
|
6256
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
|
6257
|
+
const bool convert_src1_to_q8_1) {
|
6258
|
+
|
5801
6259
|
const int64_t ne00 = src0->ne[0];
|
5802
6260
|
const int64_t ne01 = src0->ne[1];
|
5803
6261
|
const int64_t ne02 = src0->ne[2];
|
5804
6262
|
const int64_t ne03 = src0->ne[3];
|
5805
6263
|
const int64_t nrows0 = ggml_nrows(src0);
|
5806
6264
|
|
5807
|
-
const
|
5808
|
-
const int64_t
|
5809
|
-
const int64_t
|
5810
|
-
const int64_t
|
5811
|
-
const int64_t
|
5812
|
-
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
|
6265
|
+
const int64_t ne10 = src1->ne[0];
|
6266
|
+
const int64_t ne11 = src1->ne[1];
|
6267
|
+
const int64_t ne12 = src1->ne[2];
|
6268
|
+
const int64_t ne13 = src1->ne[3];
|
6269
|
+
const int64_t nrows1 = ggml_nrows(src1);
|
5813
6270
|
|
5814
6271
|
GGML_ASSERT(ne03 == ne13);
|
5815
6272
|
|
5816
6273
|
const int64_t ne0 = dst->ne[0];
|
5817
6274
|
const int64_t ne1 = dst->ne[1];
|
5818
6275
|
|
5819
|
-
const int nb2
|
5820
|
-
const int nb3
|
6276
|
+
const int nb2 = dst->nb[2];
|
6277
|
+
const int nb3 = dst->nb[3];
|
5821
6278
|
|
5822
6279
|
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
|
5823
|
-
GGML_ASSERT(
|
6280
|
+
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
|
5824
6281
|
|
5825
|
-
|
5826
|
-
const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
|
5827
|
-
const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
|
5828
|
-
const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
|
5829
|
-
const int64_t src0_stride = ne00 * ne01 * stride_mod;
|
5830
|
-
const int64_t src1_stride = ne10 * ne11 * stride_mod;
|
5831
|
-
const int64_t dst_stride = ne0 * ne1 * stride_mod;
|
6282
|
+
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
|
5832
6283
|
|
5833
|
-
const int64_t
|
5834
|
-
const int64_t i03_max = flatten_rows ? 1 : ne03;
|
5835
|
-
const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
|
5836
|
-
const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
|
5837
|
-
GGML_ASSERT(!(flatten_rows && ne02 < ne12));
|
6284
|
+
const int64_t i02_divisor = ne12 / ne02;
|
5838
6285
|
|
5839
6286
|
const size_t src0_ts = ggml_type_size(src0->type);
|
5840
6287
|
const size_t src0_bs = ggml_blck_size(src0->type);
|
6288
|
+
const size_t q8_1_ts = sizeof(block_q8_1);
|
6289
|
+
const size_t q8_1_bs = QK8_1;
|
5841
6290
|
|
5842
|
-
struct ggml_tensor_extra_gpu * src0_extra =
|
5843
|
-
struct ggml_tensor_extra_gpu * src1_extra =
|
5844
|
-
struct ggml_tensor_extra_gpu *
|
6291
|
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6292
|
+
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
6293
|
+
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
5845
6294
|
|
5846
6295
|
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
5847
6296
|
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
5848
|
-
const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
|
5849
6297
|
|
5850
|
-
const bool src1_is_contiguous =
|
5851
|
-
const
|
5852
|
-
|
6298
|
+
const bool src1_is_contiguous = ggml_is_contiguous(src1);
|
6299
|
+
const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ?
|
6300
|
+
ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
|
5853
6301
|
|
5854
6302
|
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
6303
|
+
GGML_ASSERT(!(split && ne02 > 1));
|
6304
|
+
GGML_ASSERT(!(split && ne03 > 1));
|
5855
6305
|
GGML_ASSERT(!(split && ne02 < ne12));
|
5856
6306
|
|
5857
|
-
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
5858
|
-
|
5859
6307
|
// dd = data device
|
5860
|
-
char *
|
5861
|
-
float *
|
5862
|
-
|
5863
|
-
float *
|
5864
|
-
|
5865
|
-
//
|
5866
|
-
size_t
|
5867
|
-
size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
6308
|
+
char * src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
6309
|
+
float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
|
6310
|
+
char * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
|
6311
|
+
float * dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
6312
|
+
|
6313
|
+
// as = actual size
|
6314
|
+
size_t src0_as[GGML_CUDA_MAX_DEVICES] = {0};
|
5868
6315
|
size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
5869
|
-
size_t
|
6316
|
+
size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
|
6317
|
+
size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0};
|
5870
6318
|
|
5871
|
-
|
5872
|
-
|
5873
|
-
if (split && g_device_count > 1) {
|
5874
|
-
CUDA_CHECK(cudaSetDevice(g_main_device));
|
5875
|
-
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
|
5876
|
-
}
|
6319
|
+
int64_t row_low[GGML_CUDA_MAX_DEVICES];
|
6320
|
+
int64_t row_high[GGML_CUDA_MAX_DEVICES];
|
5877
6321
|
|
5878
|
-
for (
|
5879
|
-
|
5880
|
-
|
5881
|
-
|
5882
|
-
|
5883
|
-
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
5884
|
-
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
6322
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6323
|
+
// by default, use all rows
|
6324
|
+
row_low[id] = 0;
|
6325
|
+
row_high[id] = ne01;
|
5885
6326
|
|
5886
|
-
|
6327
|
+
// for multi GPU, get the row boundaries from tensor split
|
6328
|
+
// and round to mul_mat_q tile sizes
|
5887
6329
|
if (split) {
|
5888
6330
|
const int64_t rounding = get_row_rounding(src0->type);
|
5889
6331
|
|
5890
|
-
|
5891
|
-
|
6332
|
+
if (id != 0) {
|
6333
|
+
row_low[id] = ne01*g_tensor_split[id];
|
6334
|
+
row_low[id] -= row_low[id] % rounding;
|
6335
|
+
}
|
5892
6336
|
|
5893
|
-
if (id
|
5894
|
-
row_high
|
5895
|
-
|
5896
|
-
row_high = nrows0*g_tensor_split[id + 1];
|
5897
|
-
row_high -= row_high % rounding;
|
6337
|
+
if (id != g_device_count - 1) {
|
6338
|
+
row_high[id] = ne01*g_tensor_split[id + 1];
|
6339
|
+
row_high[id] -= row_high[id] % rounding;
|
5898
6340
|
}
|
5899
|
-
} else {
|
5900
|
-
row_low = 0;
|
5901
|
-
row_high = nrows0*i02_divisor;
|
5902
6341
|
}
|
5903
|
-
|
6342
|
+
}
|
6343
|
+
|
6344
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6345
|
+
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
5904
6346
|
continue;
|
5905
6347
|
}
|
5906
6348
|
|
5907
|
-
|
5908
|
-
|
5909
|
-
cudaSetDevice(id);
|
5910
|
-
cudaStream_t cudaStream_main = g_cudaStreams_main[id];
|
6349
|
+
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
6350
|
+
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
5911
6351
|
|
5912
|
-
|
5913
|
-
|
5914
|
-
CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
|
5915
|
-
}
|
6352
|
+
ggml_cuda_set_device(id);
|
6353
|
+
const cudaStream_t stream = g_cudaStreams[id][0];
|
5916
6354
|
|
5917
6355
|
if (src0_on_device && src0_is_contiguous) {
|
5918
|
-
|
5919
|
-
src0_ddf[id] = (float *) src0_extra->data_device[id];
|
5920
|
-
} else {
|
5921
|
-
src0_ddq[id] = (char *) src0_extra->data_device[id];
|
5922
|
-
}
|
6356
|
+
src0_dd[id] = (char *) src0_extra->data_device[id];
|
5923
6357
|
} else {
|
5924
|
-
|
5925
|
-
|
5926
|
-
} else {
|
5927
|
-
src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
|
5928
|
-
}
|
6358
|
+
const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
|
6359
|
+
src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
|
5929
6360
|
}
|
5930
6361
|
|
5931
|
-
if (
|
5932
|
-
|
6362
|
+
if (src1_on_device && src1_is_contiguous) {
|
6363
|
+
src1_ddf[id] = (float *) src1_extra->data_device[id];
|
6364
|
+
} else {
|
6365
|
+
src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
|
5933
6366
|
}
|
5934
6367
|
|
5935
|
-
if (
|
5936
|
-
|
5937
|
-
|
5938
|
-
|
5939
|
-
src1_ddf[id]
|
6368
|
+
if (convert_src1_to_q8_1) {
|
6369
|
+
src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
|
6370
|
+
|
6371
|
+
if (split && src1_on_device && src1_is_contiguous) {
|
6372
|
+
quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
|
6373
|
+
CUDA_CHECK(cudaGetLastError());
|
5940
6374
|
}
|
5941
6375
|
}
|
6376
|
+
|
5942
6377
|
if (dst_on_device) {
|
5943
|
-
|
6378
|
+
dst_dd[id] = (float *) dst_extra->data_device[id];
|
5944
6379
|
} else {
|
5945
|
-
size_t size_dst_ddf = split ?
|
5946
|
-
|
6380
|
+
const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
|
6381
|
+
dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
|
5947
6382
|
}
|
6383
|
+
}
|
5948
6384
|
|
5949
|
-
|
5950
|
-
|
5951
|
-
|
5952
|
-
|
6385
|
+
// if multiple devices are used they need to wait for the main device
|
6386
|
+
// here an event is recorded that signals that the main device has finished calculating the input data
|
6387
|
+
if (split && g_device_count > 1) {
|
6388
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6389
|
+
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
|
6390
|
+
}
|
5953
6391
|
|
5954
|
-
|
6392
|
+
const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
|
6393
|
+
for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
|
6394
|
+
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
|
6395
|
+
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
|
5955
6396
|
|
5956
|
-
|
5957
|
-
|
5958
|
-
|
6397
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6398
|
+
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
6399
|
+
continue;
|
6400
|
+
}
|
5959
6401
|
|
5960
|
-
|
5961
|
-
|
5962
|
-
|
5963
|
-
if (i0 < i0_offset_low || i0 > i0_offset_high) {
|
5964
|
-
continue;
|
5965
|
-
}
|
5966
|
-
if (i0 == i0_offset_low) {
|
5967
|
-
i01_low = row_low % rows_per_iter;
|
5968
|
-
}
|
5969
|
-
if (i0 == i0_offset_high) {
|
5970
|
-
i01_high = row_high % rows_per_iter;
|
5971
|
-
}
|
5972
|
-
}
|
6402
|
+
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
6403
|
+
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
6404
|
+
const int64_t row_diff = row_high[id] - row_low[id];
|
5973
6405
|
|
5974
|
-
|
5975
|
-
|
5976
|
-
// Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
|
5977
|
-
// The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
|
5978
|
-
GGML_ASSERT(i01_low == 0 || g_device_count > 1);
|
5979
|
-
GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
|
6406
|
+
ggml_cuda_set_device(id);
|
6407
|
+
const cudaStream_t stream = g_cudaStreams[id][is];
|
5980
6408
|
|
5981
|
-
|
5982
|
-
|
5983
|
-
|
5984
|
-
|
5985
|
-
|
6409
|
+
// wait for main GPU data if necessary
|
6410
|
+
if (split && (id != g_main_device || is != 0)) {
|
6411
|
+
CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
|
6412
|
+
}
|
6413
|
+
|
6414
|
+
for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
|
6415
|
+
const int64_t i03 = i0 / ne12;
|
6416
|
+
const int64_t i02 = i0 % ne12;
|
6417
|
+
|
6418
|
+
const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
5986
6419
|
|
5987
6420
|
// for split tensors the data begins at i0 == i0_offset_low
|
5988
|
-
char *
|
5989
|
-
float *
|
5990
|
-
|
5991
|
-
float *
|
5992
|
-
|
5993
|
-
// for split tensors the data pointer needs to be rounded down
|
5994
|
-
// to the bin edge for i03, i02 bins beyond the first
|
5995
|
-
if (i0 - i0_offset_low > 0) {
|
5996
|
-
GGML_ASSERT(!flatten_rows);
|
5997
|
-
src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
|
5998
|
-
src0_ddf_i -= (row_low % ne01)*ne00;
|
5999
|
-
dst_ddf_i -= (row_low % ne0)*ne1;
|
6000
|
-
}
|
6421
|
+
char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
|
6422
|
+
float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
|
6423
|
+
char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
|
6424
|
+
float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
|
6001
6425
|
|
6002
6426
|
// the main device memory buffer can be on VRAM scratch, with space for all partial results
|
6003
6427
|
// in that case an offset on dst_ddf_i is needed
|
6004
6428
|
if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
|
6005
|
-
|
6429
|
+
dst_dd_i += row_low[id]; // offset is 0 if no tensor split
|
6006
6430
|
}
|
6007
6431
|
|
6008
6432
|
// copy src0, src1 to device if necessary
|
6009
|
-
if (
|
6010
|
-
if (
|
6011
|
-
|
6012
|
-
|
6013
|
-
|
6014
|
-
|
6015
|
-
|
6016
|
-
GGML_ASSERT(!flatten_rows);
|
6433
|
+
if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
|
6434
|
+
if (id != g_main_device) {
|
6435
|
+
if (convert_src1_to_q8_1) {
|
6436
|
+
char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
|
6437
|
+
CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs,
|
6438
|
+
cudaMemcpyDeviceToDevice, stream));
|
6439
|
+
} else {
|
6017
6440
|
float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
|
6018
|
-
src1_ddf_i_source +=
|
6019
|
-
CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source,
|
6020
|
-
cudaMemcpyDeviceToDevice,
|
6441
|
+
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
|
6442
|
+
CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float),
|
6443
|
+
cudaMemcpyDeviceToDevice, stream));
|
6021
6444
|
}
|
6022
|
-
} else if (src1_on_device && !src1_is_contiguous) {
|
6023
|
-
GGML_ASSERT(!split);
|
6024
|
-
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
|
6025
|
-
} else {
|
6026
|
-
GGML_ASSERT(false);
|
6027
6445
|
}
|
6446
|
+
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
|
6447
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
|
6448
|
+
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
6449
|
+
} else {
|
6450
|
+
GGML_ASSERT(false);
|
6028
6451
|
}
|
6029
6452
|
|
6030
|
-
if (
|
6031
|
-
|
6032
|
-
|
6033
|
-
} else {
|
6034
|
-
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
|
6035
|
-
}
|
6453
|
+
if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
|
6454
|
+
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
6455
|
+
CUDA_CHECK(cudaGetLastError());
|
6036
6456
|
}
|
6037
6457
|
|
6038
|
-
|
6039
|
-
|
6040
|
-
to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
|
6041
|
-
CUDA_CHECK(cudaGetLastError());
|
6458
|
+
if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
|
6459
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
|
6042
6460
|
}
|
6043
6461
|
|
6044
6462
|
// do the computation
|
6045
|
-
op(src0, src1, dst,
|
6463
|
+
op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
|
6464
|
+
row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
|
6046
6465
|
CUDA_CHECK(cudaGetLastError());
|
6047
6466
|
|
6048
6467
|
// copy dst to host or other device if necessary
|
@@ -6064,95 +6483,86 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
6064
6483
|
// The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
|
6065
6484
|
// Instead they need to be copied to the correct slice in ne0 = dst row index.
|
6066
6485
|
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
|
6067
|
-
float * dhf_dst_i = (float *) ((char *) dst_off_device +
|
6068
|
-
|
6069
|
-
|
6486
|
+
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
6487
|
+
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
6488
|
+
dhf_dst_i += src1_col_0*ne0 + row_low[id];
|
6489
|
+
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float),
|
6490
|
+
row_diff*sizeof(float), src1_ncols, kind, stream));
|
6070
6491
|
} else {
|
6071
6492
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
6072
|
-
|
6493
|
+
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
6494
|
+
dhf_dst_i += src1_col_0*ne0;
|
6495
|
+
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
|
6073
6496
|
}
|
6074
6497
|
}
|
6075
6498
|
|
6076
|
-
//
|
6077
|
-
if (split &&
|
6078
|
-
CUDA_CHECK(cudaEventRecord(src0_extra->events[id],
|
6499
|
+
// add event for the main device to wait on until other device is done
|
6500
|
+
if (split && (id != g_main_device || is != 0)) {
|
6501
|
+
CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
|
6079
6502
|
}
|
6080
6503
|
}
|
6081
6504
|
}
|
6082
6505
|
}
|
6083
6506
|
|
6084
|
-
|
6085
|
-
|
6086
|
-
if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
|
6087
|
-
continue;
|
6088
|
-
}
|
6089
|
-
|
6090
|
-
CUDA_CHECK(cudaSetDevice(id));
|
6507
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6508
|
+
CUDA_CHECK(ggml_cuda_set_device(id));
|
6091
6509
|
|
6092
|
-
|
6093
|
-
|
6094
|
-
|
6095
|
-
if (src0_asf[id] > 0) {
|
6096
|
-
ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
|
6510
|
+
// free buffers again when done
|
6511
|
+
if (src0_as[id] > 0) {
|
6512
|
+
ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
|
6097
6513
|
}
|
6098
6514
|
if (src1_asf[id] > 0) {
|
6099
6515
|
ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
|
6100
6516
|
}
|
6101
|
-
if (
|
6102
|
-
ggml_cuda_pool_free(
|
6517
|
+
if (src1_asq[id] > 0) {
|
6518
|
+
ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
|
6519
|
+
}
|
6520
|
+
if (dst_as[id] > 0) {
|
6521
|
+
ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
|
6103
6522
|
}
|
6104
6523
|
}
|
6105
6524
|
|
6106
6525
|
// main device waits for all other devices to be finished
|
6107
6526
|
if (split && g_device_count > 1) {
|
6108
|
-
|
6109
|
-
|
6110
|
-
|
6111
|
-
|
6527
|
+
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
|
6528
|
+
is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
|
6529
|
+
|
6530
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6531
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6532
|
+
for (int64_t is = 0; is < is_max; ++is) {
|
6533
|
+
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
|
6112
6534
|
}
|
6113
6535
|
}
|
6114
6536
|
}
|
6115
6537
|
|
6116
6538
|
if (dst->backend == GGML_BACKEND_CPU) {
|
6117
|
-
CUDA_CHECK(
|
6539
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6118
6540
|
CUDA_CHECK(cudaDeviceSynchronize());
|
6119
6541
|
}
|
6120
6542
|
}
|
6121
6543
|
|
6122
6544
|
void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6123
|
-
|
6124
|
-
// Due to flatten_rows == true this does in practice not make a difference however.
|
6125
|
-
// Better solution would be nice but right now that would require disproportionate changes.
|
6126
|
-
GGML_ASSERT(
|
6127
|
-
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
|
6128
|
-
src1->type == GGML_TYPE_F32 &&
|
6129
|
-
(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
|
6130
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
|
6545
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
|
6131
6546
|
}
|
6132
6547
|
|
6133
6548
|
void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6134
|
-
|
6135
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
|
6549
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
|
6136
6550
|
}
|
6137
6551
|
|
6138
6552
|
void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6139
|
-
|
6140
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
|
6553
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
|
6141
6554
|
}
|
6142
6555
|
|
6143
6556
|
void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6144
|
-
|
6145
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
|
6557
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
|
6146
6558
|
}
|
6147
6559
|
|
6148
6560
|
void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6149
|
-
|
6150
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
|
6561
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
|
6151
6562
|
}
|
6152
6563
|
|
6153
6564
|
void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6154
|
-
|
6155
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
|
6565
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
|
6156
6566
|
}
|
6157
6567
|
|
6158
6568
|
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
@@ -6186,8 +6596,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
|
|
6186
6596
|
|
6187
6597
|
const int64_t ne12 = src1->ne[2];
|
6188
6598
|
|
6189
|
-
CUDA_CHECK(
|
6190
|
-
cudaStream_t
|
6599
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6600
|
+
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
6191
6601
|
|
6192
6602
|
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6193
6603
|
void * src0_ddq = src0_extra->data_device[g_main_device];
|
@@ -6198,7 +6608,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
|
|
6198
6608
|
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
6199
6609
|
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
6200
6610
|
|
6201
|
-
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12,
|
6611
|
+
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
|
6202
6612
|
}
|
6203
6613
|
|
6204
6614
|
void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
@@ -6217,8 +6627,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
|
|
6217
6627
|
const int64_t nb01 = src0->nb[1];
|
6218
6628
|
const int64_t nb02 = src0->nb[2];
|
6219
6629
|
|
6220
|
-
CUDA_CHECK(
|
6221
|
-
cudaStream_t
|
6630
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6631
|
+
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
6222
6632
|
|
6223
6633
|
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6224
6634
|
void * src0_ddq = src0_extra->data_device[g_main_device];
|
@@ -6229,38 +6639,49 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
-    const
-    const
+    const int64_t row_stride_x = nb01 / sizeof(half);
+    const int64_t channel_stride_x = nb02 / sizeof(half);
 
-    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x,
+    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
         src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
 
+    int64_t min_compute_capability = INT_MAX;
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (min_compute_capability > g_compute_capabilities[id]
+            && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            min_compute_capability = g_compute_capabilities[id];
+        }
+    }
+
     if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
     } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
     }else if (src0->type == GGML_TYPE_F32) {
-
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
-        } else {
-            int min_compute_capability = INT_MAX;
-            for (int id = 0; id < g_device_count; ++id) {
-                if (min_compute_capability > g_compute_capabilities[id]
-                    && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-                    min_compute_capability = g_compute_capabilities[id];
-                }
-            }
 
+#ifdef GGML_CUDA_FORCE_DMMV
+            const bool use_mul_mat_vec_q = false;
+#else
+            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+#endif // GGML_CUDA_FORCE_DMMV
+
+            if (use_mul_mat_vec_q) {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+            } else {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+            }
+        } else {
             if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
-
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
             } else {
-
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
            }
         }
     } else {
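The largest hunk above hoists the minimum-compute-capability scan out of the vector-only branch so every path in ggml_cuda_mul_mat can consult it, skipping devices whose g_tensor_split range is empty, and adds a use_mul_mat_vec_q switch that prefers the quantized dot-product kernels once the weakest participating device supports __dp4a (MIN_CC_DP4A, 610). The host-only sketch below reproduces just that decision; the device values standing in for g_compute_capabilities and g_tensor_split are made up for illustration.

#include <climits>
#include <cstdio>

static const int MIN_CC_DP4A = 610;  // same threshold defined at the top of this file

int main() {
    // Hypothetical example values standing in for g_compute_capabilities / g_tensor_split.
    const int   device_count          = 2;
    const int   compute_capability[2] = { 860, 610 };
    const float tensor_split[2]       = { 0.0f, 0.5f };  // both devices receive a slice of rows
    const bool  src0_is_quantized     = true;

    int min_compute_capability = INT_MAX;
    for (int id = 0; id < device_count; ++id) {
        // A device only counts if its split range is non-empty.
        const float next_split = id + 1 < device_count ? tensor_split[id + 1] : 1.0f;
        if (min_compute_capability > compute_capability[id] && tensor_split[id] < next_split) {
            min_compute_capability = compute_capability[id];
        }
    }

#ifdef GGML_CUDA_FORCE_DMMV
    const bool use_mul_mat_vec_q = false;  // forced fallback to dequantize + mat-vec
#else
    const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && src0_is_quantized;
#endif

    printf("min compute capability = %d, use_mul_mat_vec_q = %d\n",
           min_compute_capability, use_mul_mat_vec_q);
    return 0;
}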
@@ -6269,8 +6690,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
 }
 
 void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }
 
 void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6299,8 +6719,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     const int64_t nb11 = src1->nb[1];
     const int64_t nb12 = src1->nb[2];
 
-    CUDA_CHECK(
-    cudaStream_t
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -6310,10 +6730,10 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 
     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12,
+                              ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12,
+                              ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         GGML_ASSERT(false);
     }
@@ -6327,28 +6747,20 @@ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 }
 
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
 }
 
 void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
 }
 
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
-
-    const int mode = ((int32_t *) dst->op_params)[2];
-    const bool is_glm = mode & 4;
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
 }
 
 void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }
 
 void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
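The operations above (scale, diag_mask_inf, soft_max, rope, alibi) all switch from the old ggml_cuda_op(..., true, true) entry point to a single ggml_cuda_op_flatten(...) wrapper that takes the op as a callback. The sketch below shows only the shape of that refactor; the struct, typedef, and wrapper are illustrative stand-ins, not the actual signatures in ggml-cuda.cu.

#include <cstdio>

struct tensor { int ne[4]; float * data; };  // toy tensor, not ggml_tensor

typedef void (*flatten_op_t)(const tensor * src0, const tensor * src1, tensor * dst);

// A per-op kernel body; in the real code this would launch a CUDA kernel on main_stream.
static void op_scale(const tensor * src0, const tensor * src1, tensor * dst) {
    const float v = src1->data[0];
    for (int i = 0; i < src0->ne[0]; ++i) {
        dst->data[i] = src0->data[i] * v;
    }
}

// One wrapper carries the shared boilerplate (device/stream selection and pointer
// lookup in the real code) and then invokes the op; each public entry point
// collapses to a one-line call, as in the diff above.
static void op_flatten(const tensor * src0, const tensor * src1, tensor * dst, flatten_op_t op) {
    // ... shared setup would go here ...
    op(src0, src1, dst);
}

static void cuda_scale(const tensor * src0, const tensor * src1, tensor * dst) {
    op_flatten(src0, src1, dst, op_scale);
}

int main() {
    float in[4] = { 1, 2, 3, 4 }, s[1] = { 2.0f }, out[4] = { 0 };
    tensor src0 = { { 4, 1, 1, 1 }, in }, src1 = { { 1, 1, 1, 1 }, s }, dst = { { 4, 1, 1, 1 }, out };
    cuda_scale(&src0, &src1, &dst);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}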
@@ -6358,7 +6770,7 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 }
 
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
-
+    const int64_t nrows = ggml_nrows(tensor);
 
     const int64_t ne0 = tensor->ne[0];
 
@@ -6368,14 +6780,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
 
-    for (
+    for (int64_t id = 0; id < g_device_count; ++id) {
         if (backend == GGML_BACKEND_GPU && id != g_main_device) {
             continue;
         }
 
-
+        ggml_cuda_set_device(id);
 
-
+        int64_t row_low, row_high;
         if (backend == GGML_BACKEND_GPU) {
             row_low = 0;
             row_high = nrows;
@@ -6425,7 +6837,9 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         extra->data_device[id] = buf;
 
         if (backend == GGML_BACKEND_GPU_SPLIT) {
-
+            for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+                CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
+            }
         }
     }
 
@@ -6439,15 +6853,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
-    for (
+    for (int64_t id = 0; id < g_device_count; ++id) {
         if (extra->data_device[id] != nullptr) {
-            CUDA_CHECK(
+            CUDA_CHECK(ggml_cuda_set_device(id));
             CUDA_CHECK(cudaFree(extra->data_device[id]));
         }
 
-
-
-
+        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+            if (extra->events[id][is] != nullptr) {
+                CUDA_CHECK(ggml_cuda_set_device(id));
+                CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
+            }
         }
     }
 
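The ggml_cuda_transform_tensor and ggml_cuda_free_data hunks above add one CUDA event per (device, stream) slot for split tensors, created with cudaEventDisableTiming and destroyed on the device that owns it. Below is a self-contained sketch of that create/destroy lifecycle; MAX_DEVICES and MAX_STREAMS here are assumed values for the sketch, not taken from the source.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define MAX_DEVICES 16  // assumption for this sketch
#define MAX_STREAMS 8   // assumption for this sketch

#define CUDA_CHECK(call)                                                \
    do {                                                                \
        cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",                 \
                    cudaGetErrorString(err_), __FILE__, __LINE__);      \
            exit(1);                                                    \
        }                                                               \
    } while (0)

// One event per (device, stream) slot; zero-initialized so unused slots stay nullptr,
// which is exactly what the destruction loop checks before destroying.
static cudaEvent_t events[MAX_DEVICES][MAX_STREAMS] = {};

int main() {
    int device_count = 0;
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    if (device_count > MAX_DEVICES) device_count = MAX_DEVICES;

    // Creation, as in the transform_tensor hunk: timing disabled because the events
    // are only used for cross-stream ordering, and created on the owning device.
    for (int id = 0; id < device_count; ++id) {
        CUDA_CHECK(cudaSetDevice(id));
        for (int is = 0; is < MAX_STREAMS; ++is) {
            CUDA_CHECK(cudaEventCreateWithFlags(&events[id][is], cudaEventDisableTiming));
        }
    }

    // Destruction, as in the free_data hunk: switch back to the owning device first.
    for (int id = 0; id < device_count; ++id) {
        for (int is = 0; is < MAX_STREAMS; ++is) {
            if (events[id][is] != nullptr) {
                CUDA_CHECK(cudaSetDevice(id));
                CUDA_CHECK(cudaEventDestroy(events[id][is]));
            }
        }
    }
    return 0;
}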
@@ -6499,7 +6915,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
-    CUDA_CHECK(
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];