llama_cpp 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -2
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +101 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +1094 -678
- data/ext/llama_cpp/src/ggml-metal.m +89 -23
- data/ext/llama_cpp/src/ggml-metal.metal +398 -211
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +32 -56
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +49 -13
- data/ext/llama_cpp/src/llama.cpp +833 -281
- data/ext/llama_cpp/src/llama.h +11 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -13,7 +13,7 @@
 #ifdef __HIP_PLATFORM_AMD__
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
-#endif
+#endif // __HIP_PLATFORM_AMD__
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
@@ -68,25 +68,52 @@
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
-#endif
+#endif // defined(GGML_USE_HIPBLAS)

 #include "ggml-cuda.h"
 #include "ggml.h"

-#define MIN_CC_DP4A
-#
-#define
-#
+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define CC_TURING 700
+#define CC_OFFSET_AMD 1000000
+#define CC_RDNA2 CC_OFFSET_AMD + 1030

 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300

+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+    defined(__gfx1150__) || defined(__gfx1151__)
+#define RDNA3
+#endif
+
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+#define RDNA2
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
     const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
     const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
     return reinterpret_cast<const int&>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int&>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
 }

 static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
@@ -115,7 +142,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 #endif
     return c;
 }
-#endif
+#endif // defined(GGML_USE_HIPBLAS)

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -127,8 +154,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do { \
         cudaError_t err_ = (err); \
         if (err_ != cudaSuccess) { \
-            fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
+            int id; \
+            cudaGetDevice(&id); \
+            fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
                 cudaGetErrorString(err_)); \
+            fprintf(stderr, "current device: %d\n", id); \
             exit(1); \
         } \
     } while (0)
@@ -138,8 +168,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do { \
         cublasStatus_t err_ = (err); \
         if (err_ != CUBLAS_STATUS_SUCCESS) { \
+            int id; \
+            cudaGetDevice(&id); \
             fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
                 err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
+            fprintf(stderr, "current device: %d\n", id); \
             exit(1); \
         } \
     } while (0)
@@ -148,7 +181,10 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do { \
         cublasStatus_t err_ = (err); \
         if (err_ != CUBLAS_STATUS_SUCCESS) { \
+            int id; \
+            cudaGetDevice(&id); \
             fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
+            fprintf(stderr, "current device: %d\n", id); \
             exit(1); \
         } \
     } while (0)
@@ -195,10 +231,13 @@ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__
 typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
-typedef void (*
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
-
-    cudaStream_t &
+typedef void (*ggml_cuda_op_mul_mat_t)(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, const cudaStream_t & stream);
+typedef void (*ggml_cuda_op_flatten_t)(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream);

 // QK = number of values after dequantization
 // QR = QK / number of values before dequantization
@@ -379,11 +418,29 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif

+#define MUL_MAT_SRC1_COL_STRIDE 128
+
+#define MAX_STREAMS 8
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+
 struct ggml_tensor_extra_gpu {
     void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
 };

+// this is faster on Windows
+// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
+inline cudaError_t ggml_cuda_set_device(const int device) {
+    int current_device;
+    CUDA_CHECK(cudaGetDevice(&current_device));
+
+    if (device == current_device) {
+        return cudaSuccess;
+    }
+
+    return cudaSetDevice(device);
+}
+
 static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
@@ -396,8 +453,6 @@ static size_t g_scratch_offset = 0;

 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -447,58 +502,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }

+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;

     const float eps = 1e-5f;

-    float mean = 0.0f;
-    float var = 0.0f;
+    float2 mean_var = make_float2(0.f, 0.f);

-    for (int col = tid; col < ncols; col += WARP_SIZE) {
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
-        mean += xi;
-        var += xi * xi;
+        mean_var.x += xi;
+        mean_var.y += xi * xi;
     }

     // sum up partial sums
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
-        var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+    mean_var = warp_reduce_sum(mean_var);
+    if (block_size > WARP_SIZE) {
+        __shared__ float2 s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        __syncthreads();
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var);
     }

-    mean
-    var =
-    const float
+    const float mean = mean_var.x / ncols;
+    const float var = mean_var.y / ncols - mean * mean;
+    const float inv_std = rsqrtf(var + eps);

-    for (int col = tid; col < ncols; col += WARP_SIZE) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) *
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+    }
+}
+
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
     }
+    return x;
 }

+template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;

     float tmp = 0.0f; // partial sum for thread in warp

-    for (int col = tid; col < ncols; col += WARP_SIZE) {
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }

     // sum up partial sums
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
     }

     const float mean = tmp / ncols;
     const float scale = rsqrtf(mean + eps);

-    for (int col = tid; col < ncols; col += WARP_SIZE) {
+    for (int col = tid; col < ncols; col += block_size) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
@@ -3394,6 +3482,12 @@ static __device__ __forceinline__ void mul_mat_q(
     }
 }

+#define MMQ_X_Q4_0_RDNA2 64
+#define MMQ_Y_Q4_0_RDNA2 128
+#define NWARPS_Q4_0_RDNA2 8
+#define MMQ_X_Q4_0_RDNA1 64
+#define MMQ_Y_Q4_0_RDNA1 64
+#define NWARPS_Q4_0_RDNA1 8
 #define MMQ_X_Q4_0_AMPERE 64
 #define MMQ_Y_Q4_0_AMPERE 128
 #define NWARPS_Q4_0_AMPERE 4
@@ -3401,11 +3495,32 @@ static __device__ __forceinline__ void mul_mat_q(
 #define MMQ_Y_Q4_0_PASCAL 64
 #define NWARPS_Q4_0_PASCAL 8

-template <bool need_check> static __global__ void mul_mat_q4_0(
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q4_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q4_0_RDNA2;
+    const int mmq_y = MMQ_Y_Q4_0_RDNA2;
+    const int nwarps = NWARPS_Q4_0_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q4_0_RDNA1;
+    const int mmq_y = MMQ_Y_Q4_0_RDNA1;
+    const int nwarps = NWARPS_Q4_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q4_0_AMPERE;
     const int mmq_y = MMQ_Y_Q4_0_AMPERE;
     const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3428,6 +3543,12 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q4_1_RDNA2 64
+#define MMQ_Y_Q4_1_RDNA2 128
+#define NWARPS_Q4_1_RDNA2 8
+#define MMQ_X_Q4_1_RDNA1 64
+#define MMQ_Y_Q4_1_RDNA1 64
+#define NWARPS_Q4_1_RDNA1 8
 #define MMQ_X_Q4_1_AMPERE 64
 #define MMQ_Y_Q4_1_AMPERE 128
 #define NWARPS_Q4_1_AMPERE 4
@@ -3436,14 +3557,33 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
 #define NWARPS_Q4_1_PASCAL 8

 template <bool need_check> static __global__ void
-#if __CUDA_ARCH__ < CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
     mul_mat_q4_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q4_1_RDNA2;
+    const int mmq_y = MMQ_Y_Q4_1_RDNA2;
+    const int nwarps = NWARPS_Q4_1_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q4_1_RDNA1;
+    const int mmq_y = MMQ_Y_Q4_1_RDNA1;
+    const int nwarps = NWARPS_Q4_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q4_1_AMPERE;
     const int mmq_y = MMQ_Y_Q4_1_AMPERE;
     const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3466,6 +3606,12 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q5_0_RDNA2 64
+#define MMQ_Y_Q5_0_RDNA2 128
+#define NWARPS_Q5_0_RDNA2 8
+#define MMQ_X_Q5_0_RDNA1 64
+#define MMQ_Y_Q5_0_RDNA1 64
+#define NWARPS_Q5_0_RDNA1 8
 #define MMQ_X_Q5_0_AMPERE 128
 #define MMQ_Y_Q5_0_AMPERE 64
 #define NWARPS_Q5_0_AMPERE 4
@@ -3473,11 +3619,32 @@ template <bool need_check> static __global__ void
 #define MMQ_Y_Q5_0_PASCAL 64
 #define NWARPS_Q5_0_PASCAL 8

-template <bool need_check> static __global__ void mul_mat_q5_0(
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q5_0_RDNA2;
+    const int mmq_y = MMQ_Y_Q5_0_RDNA2;
+    const int nwarps = NWARPS_Q5_0_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q5_0_RDNA1;
+    const int mmq_y = MMQ_Y_Q5_0_RDNA1;
+    const int nwarps = NWARPS_Q5_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q5_0_AMPERE;
     const int mmq_y = MMQ_Y_Q5_0_AMPERE;
     const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3500,6 +3667,12 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q5_1_RDNA2 64
+#define MMQ_Y_Q5_1_RDNA2 128
+#define NWARPS_Q5_1_RDNA2 8
+#define MMQ_X_Q5_1_RDNA1 64
+#define MMQ_Y_Q5_1_RDNA1 64
+#define NWARPS_Q5_1_RDNA1 8
 #define MMQ_X_Q5_1_AMPERE 128
 #define MMQ_Y_Q5_1_AMPERE 64
 #define NWARPS_Q5_1_AMPERE 4
@@ -3507,11 +3680,32 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
 #define MMQ_Y_Q5_1_PASCAL 64
 #define NWARPS_Q5_1_PASCAL 8

-template <bool need_check> static __global__ void mul_mat_q5_1(
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q5_1_RDNA2;
+    const int mmq_y = MMQ_Y_Q5_1_RDNA2;
+    const int nwarps = NWARPS_Q5_1_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q5_1_RDNA1;
+    const int mmq_y = MMQ_Y_Q5_1_RDNA1;
+    const int nwarps = NWARPS_Q5_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q5_1_AMPERE;
     const int mmq_y = MMQ_Y_Q5_1_AMPERE;
     const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3534,6 +3728,12 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q8_0_RDNA2 64
+#define MMQ_Y_Q8_0_RDNA2 128
+#define NWARPS_Q8_0_RDNA2 8
+#define MMQ_X_Q8_0_RDNA1 64
+#define MMQ_Y_Q8_0_RDNA1 64
+#define NWARPS_Q8_0_RDNA1 8
 #define MMQ_X_Q8_0_AMPERE 128
 #define MMQ_Y_Q8_0_AMPERE 64
 #define NWARPS_Q8_0_AMPERE 4
@@ -3541,11 +3741,32 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
 #define MMQ_Y_Q8_0_PASCAL 64
 #define NWARPS_Q8_0_PASCAL 8

-template <bool need_check> static __global__ void mul_mat_q8_0(
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q8_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q8_0_RDNA2;
+    const int mmq_y = MMQ_Y_Q8_0_RDNA2;
+    const int nwarps = NWARPS_Q8_0_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q8_0_RDNA1;
+    const int mmq_y = MMQ_Y_Q8_0_RDNA1;
+    const int nwarps = NWARPS_Q8_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q8_0_AMPERE;
     const int mmq_y = MMQ_Y_Q8_0_AMPERE;
     const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3568,6 +3789,12 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q2_K_RDNA2 64
+#define MMQ_Y_Q2_K_RDNA2 128
+#define NWARPS_Q2_K_RDNA2 8
+#define MMQ_X_Q2_K_RDNA1 128
+#define MMQ_Y_Q2_K_RDNA1 32
+#define NWARPS_Q2_K_RDNA1 8
 #define MMQ_X_Q2_K_AMPERE 64
 #define MMQ_Y_Q2_K_AMPERE 128
 #define NWARPS_Q2_K_AMPERE 4
@@ -3575,11 +3802,32 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
 #define MMQ_Y_Q2_K_PASCAL 64
 #define NWARPS_Q2_K_PASCAL 8

-template <bool need_check> static __global__ void mul_mat_q2_K(
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q2_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q2_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q2_K_RDNA2;
+    const int nwarps = NWARPS_Q2_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q2_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q2_K_RDNA1;
+    const int nwarps = NWARPS_Q2_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q2_K_AMPERE;
     const int mmq_y = MMQ_Y_Q2_K_AMPERE;
     const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3602,6 +3850,12 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q3_K_RDNA2 128
+#define MMQ_Y_Q3_K_RDNA2 64
+#define NWARPS_Q3_K_RDNA2 8
+#define MMQ_X_Q3_K_RDNA1 32
+#define MMQ_Y_Q3_K_RDNA1 128
+#define NWARPS_Q3_K_RDNA1 8
 #define MMQ_X_Q3_K_AMPERE 128
 #define MMQ_Y_Q3_K_AMPERE 128
 #define NWARPS_Q3_K_AMPERE 4
@@ -3610,14 +3864,33 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
 #define NWARPS_Q3_K_PASCAL 8

 template <bool need_check> static __global__ void
-#if __CUDA_ARCH__ < CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
     __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
     mul_mat_q3_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q3_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q3_K_RDNA2;
+    const int nwarps = NWARPS_Q3_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q3_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q3_K_RDNA1;
+    const int nwarps = NWARPS_Q3_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q3_K_AMPERE;
     const int mmq_y = MMQ_Y_Q3_K_AMPERE;
     const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3640,6 +3913,12 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q4_K_RDNA2 64
+#define MMQ_Y_Q4_K_RDNA2 128
+#define NWARPS_Q4_K_RDNA2 8
+#define MMQ_X_Q4_K_RDNA1 32
+#define MMQ_Y_Q4_K_RDNA1 64
+#define NWARPS_Q4_K_RDNA1 8
 #define MMQ_X_Q4_K_AMPERE 64
 #define MMQ_Y_Q4_K_AMPERE 128
 #define NWARPS_Q4_K_AMPERE 4
@@ -3648,14 +3927,33 @@ template <bool need_check> static __global__ void
 #define NWARPS_Q4_K_PASCAL 8

 template <bool need_check> static __global__ void
-#if __CUDA_ARCH__ < CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
     mul_mat_q4_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q4_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q4_K_RDNA2;
+    const int nwarps = NWARPS_Q4_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q4_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q4_K_RDNA1;
+    const int nwarps = NWARPS_Q4_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q4_K_AMPERE;
     const int mmq_y = MMQ_Y_Q4_K_AMPERE;
     const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -3678,6 +3976,12 @@ template <bool need_check> static __global__ void
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q5_K_RDNA2 64
+#define MMQ_Y_Q5_K_RDNA2 128
+#define NWARPS_Q5_K_RDNA2 8
+#define MMQ_X_Q5_K_RDNA1 32
+#define MMQ_Y_Q5_K_RDNA1 64
+#define NWARPS_Q5_K_RDNA1 8
 #define MMQ_X_Q5_K_AMPERE 64
 #define MMQ_Y_Q5_K_AMPERE 128
 #define NWARPS_Q5_K_AMPERE 4
@@ -3685,11 +3989,32 @@ template <bool need_check> static __global__ void
 #define MMQ_Y_Q5_K_PASCAL 64
 #define NWARPS_Q5_K_PASCAL 8

-template <bool need_check> static __global__ void mul_mat_q5_K(
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q5_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q5_K_RDNA2;
+    const int nwarps = NWARPS_Q5_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q5_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q5_K_RDNA1;
+    const int nwarps = NWARPS_Q5_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q5_K_AMPERE;
     const int mmq_y = MMQ_Y_Q5_K_AMPERE;
     const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -3712,6 +4037,12 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
 #endif // __CUDA_ARCH__ >= CC_TURING
 }

+#define MMQ_X_Q6_K_RDNA2 64
+#define MMQ_Y_Q6_K_RDNA2 128
+#define NWARPS_Q6_K_RDNA2 8
+#define MMQ_X_Q6_K_RDNA1 32
+#define MMQ_Y_Q6_K_RDNA1 64
+#define NWARPS_Q6_K_RDNA1 8
 #define MMQ_X_Q6_K_AMPERE 64
 #define MMQ_Y_Q6_K_AMPERE 64
 #define NWARPS_Q6_K_AMPERE 4
@@ -3720,14 +4051,33 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
 #define NWARPS_Q6_K_PASCAL 8

 template <bool need_check> static __global__ void
-#if __CUDA_ARCH__ < CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif __CUDA_ARCH__ < CC_TURING
     __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_TURING
     mul_mat_q6_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q6_K_RDNA2;
+    const int mmq_y = MMQ_Y_Q6_K_RDNA2;
+    const int nwarps = NWARPS_Q6_K_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q6_K_RDNA1;
+    const int mmq_y = MMQ_Y_Q6_K_RDNA1;
+    const int nwarps = NWARPS_Q6_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q6_K_AMPERE;
     const int mmq_y = MMQ_Y_Q6_K_AMPERE;
     const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4036,7 +4386,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
     dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }

-static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
+    const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;

@@ -4048,8 +4399,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     const int i = row*ncols + col;

     const float col_theta_scale = powf(theta_scale, col);
+    const float p = p0 + p_delta*(row/p_delta_rows);

-    const float theta = p*col_theta_scale;
+    const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);

@@ -4059,7 +4411,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     dst[i + 0]           = x0*cos_theta - x1*sin_theta;
     dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

-    const float block_theta =
+    const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
     const float sin_block_theta = sinf(block_theta);
     const float cos_block_theta = cosf(block_theta);

@@ -4186,14 +4538,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_

 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    }
 }

 static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
 }

 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
@@ -4498,7 +4860,15 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q4_0_RDNA2;
+        mmq_y = MMQ_Y_Q4_0_RDNA2;
+        nwarps = NWARPS_Q4_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q4_0_RDNA1;
+        mmq_y = MMQ_Y_Q4_0_RDNA1;
+        nwarps = NWARPS_Q4_0_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q4_0_AMPERE;
         mmq_y = MMQ_Y_Q4_0_AMPERE;
         nwarps = NWARPS_Q4_0_AMPERE;
@@ -4535,7 +4905,15 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q4_1_RDNA2;
+        mmq_y = MMQ_Y_Q4_1_RDNA2;
+        nwarps = NWARPS_Q4_1_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q4_1_RDNA1;
+        mmq_y = MMQ_Y_Q4_1_RDNA1;
+        nwarps = NWARPS_Q4_1_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q4_1_AMPERE;
         mmq_y = MMQ_Y_Q4_1_AMPERE;
         nwarps = NWARPS_Q4_1_AMPERE;
@@ -4572,7 +4950,15 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q5_0_RDNA2;
+        mmq_y = MMQ_Y_Q5_0_RDNA2;
+        nwarps = NWARPS_Q5_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q5_0_RDNA1;
+        mmq_y = MMQ_Y_Q5_0_RDNA1;
+        nwarps = NWARPS_Q5_0_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q5_0_AMPERE;
         mmq_y = MMQ_Y_Q5_0_AMPERE;
         nwarps = NWARPS_Q5_0_AMPERE;
@@ -4609,7 +4995,15 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q5_1_RDNA2;
+        mmq_y = MMQ_Y_Q5_1_RDNA2;
+        nwarps = NWARPS_Q5_1_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q5_1_RDNA1;
+        mmq_y = MMQ_Y_Q5_1_RDNA1;
+        nwarps = NWARPS_Q5_1_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q5_1_AMPERE;
         mmq_y = MMQ_Y_Q5_1_AMPERE;
         nwarps = NWARPS_Q5_1_AMPERE;
@@ -4646,7 +5040,15 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q8_0_RDNA2;
+        mmq_y = MMQ_Y_Q8_0_RDNA2;
+        nwarps = NWARPS_Q8_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q8_0_RDNA1;
+        mmq_y = MMQ_Y_Q8_0_RDNA1;
+        nwarps = NWARPS_Q8_0_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q8_0_AMPERE;
         mmq_y = MMQ_Y_Q8_0_AMPERE;
         nwarps = NWARPS_Q8_0_AMPERE;
@@ -4683,7 +5085,15 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q2_K_RDNA2;
+        mmq_y = MMQ_Y_Q2_K_RDNA2;
+        nwarps = NWARPS_Q2_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q2_K_RDNA1;
+        mmq_y = MMQ_Y_Q2_K_RDNA1;
+        nwarps = NWARPS_Q2_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q2_K_AMPERE;
         mmq_y = MMQ_Y_Q2_K_AMPERE;
         nwarps = NWARPS_Q2_K_AMPERE;
@@ -4722,7 +5132,15 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q3_K_RDNA2;
+        mmq_y = MMQ_Y_Q3_K_RDNA2;
+        nwarps = NWARPS_Q3_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q3_K_RDNA1;
+        mmq_y = MMQ_Y_Q3_K_RDNA1;
+        nwarps = NWARPS_Q3_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q3_K_AMPERE;
         mmq_y = MMQ_Y_Q3_K_AMPERE;
         nwarps = NWARPS_Q3_K_AMPERE;
@@ -4760,7 +5178,15 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q4_K_RDNA2;
+        mmq_y = MMQ_Y_Q4_K_RDNA2;
+        nwarps = NWARPS_Q4_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q4_K_RDNA1;
+        mmq_y = MMQ_Y_Q4_K_RDNA1;
+        nwarps = NWARPS_Q4_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q4_K_AMPERE;
         mmq_y = MMQ_Y_Q4_K_AMPERE;
         nwarps = NWARPS_Q4_K_AMPERE;
@@ -4797,7 +5223,15 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q5_K_RDNA2;
+        mmq_y = MMQ_Y_Q5_K_RDNA2;
+        nwarps = NWARPS_Q5_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q5_K_RDNA1;
+        mmq_y = MMQ_Y_Q5_K_RDNA1;
+        nwarps = NWARPS_Q5_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q5_K_AMPERE;
         mmq_y = MMQ_Y_Q5_K_AMPERE;
         nwarps = NWARPS_Q5_K_AMPERE;
@@ -4834,7 +5268,15 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     const int compute_capability = g_compute_capabilities[id];

     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_TURING) {
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x = MMQ_X_Q6_K_RDNA2;
+        mmq_y = MMQ_Y_Q6_K_RDNA2;
+        nwarps = NWARPS_Q6_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x = MMQ_X_Q6_K_RDNA1;
+        mmq_y = MMQ_Y_Q6_K_RDNA1;
+        nwarps = NWARPS_Q6_K_RDNA1;
+    } else if (compute_capability >= CC_TURING) {
         mmq_x = MMQ_X_Q6_K_AMPERE;
         mmq_y = MMQ_Y_Q6_K_AMPERE;
         nwarps = NWARPS_Q6_K_AMPERE;
@@ -4924,12 +5366,13 @@ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, co
     rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }

-static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float
-
-
-    const
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+    const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
+    GGML_ASSERT(ncols % 4 == 0);
+    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
+    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols,
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
 }

 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5067,25 +5510,30 @@ void ggml_init_cublas() {
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
         fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-        for (int id = 0; id < g_device_count; ++id) {
+        for (int64_t id = 0; id < g_device_count; ++id) {
             cudaDeviceProp prop;
             CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-            fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+            fprintf(stderr, "  Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);

             g_tensor_split[id] = total_vram;
             total_vram += prop.totalGlobalMem;
-
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+            g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
+#else
             g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
         }
-        for (int id = 0; id < g_device_count; ++id) {
+        for (int64_t id = 0; id < g_device_count; ++id) {
             g_tensor_split[id] /= total_vram;
         }

-        for (int id = 0; id < g_device_count; ++id) {
-            CUDA_CHECK(cudaSetDevice(id));
+        for (int64_t id = 0; id < g_device_count; ++id) {
+            CUDA_CHECK(ggml_cuda_set_device(id));

-            // create main stream
-            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
+            // create cuda streams
+            for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+                CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
+            }

             // create cublas handle
             CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
@@ -5154,7 +5602,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     if (src->backend == GGML_BACKEND_CPU) {
         kind = cudaMemcpyHostToDevice;
         src_ptr = (char *) src->data;
-    } else if (src->backend == GGML_BACKEND_GPU) {
+    } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
+        GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = cudaMemcpyDeviceToDevice;
         struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
@@ -5193,236 +5642,205 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
|
|
5193
5642
|
}
|
5194
5643
|
|
5195
5644
|
inline void ggml_cuda_op_add(
|
5196
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5197
|
-
float *
|
5198
|
-
cudaStream_t & cudaStream_main){
|
5199
|
-
|
5200
|
-
GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
|
5201
|
-
GGML_ASSERT(src1_ddf_i != nullptr);
|
5202
|
-
GGML_ASSERT(dst_ddf_i != nullptr);
|
5645
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5646
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5203
5647
|
|
5204
|
-
|
5205
|
-
const int64_t i01_diff = i01_high - i01_low;
|
5648
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
5206
5649
|
|
5207
5650
|
const int64_t ne10 = src1->ne[0];
|
5208
5651
|
const int64_t ne11 = src1->ne[1];
|
5209
5652
|
|
5210
|
-
// compute
|
5211
5653
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
5212
|
-
add_f32_cuda(
|
5654
|
+
add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
|
5213
5655
|
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
5214
|
-
add_f16_f32_f16_cuda((half *)
|
5656
|
+
add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
|
5215
5657
|
} else {
|
5216
5658
|
GGML_ASSERT(false);
|
5217
5659
|
}
|
5218
5660
|
|
5219
5661
|
(void) src1;
|
5220
5662
|
(void) dst;
|
5221
|
-
(void) src0_ddq_i;
|
5222
|
-
(void) i02;
|
5223
|
-
(void) i1;
|
5224
5663
|
}
|
5225
5664
|
|
5226
5665
|
inline void ggml_cuda_op_mul(
|
5227
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5228
|
-
float *
|
5229
|
-
cudaStream_t & cudaStream_main){
|
5230
|
-
|
5231
|
-
GGML_ASSERT(src0_ddf_i != nullptr);
|
5232
|
-
GGML_ASSERT(src1_ddf_i != nullptr);
|
5233
|
-
GGML_ASSERT(dst_ddf_i != nullptr);
|
5666
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5667
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5234
5668
|
|
5235
|
-
|
5236
|
-
|
5669
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
5670
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
5671
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5237
5672
|
|
5238
5673
|
const int64_t ne10 = src1->ne[0];
|
5239
5674
|
const int64_t ne11 = src1->ne[1];
|
5240
5675
|
|
5241
|
-
mul_f32_cuda(
|
5676
|
+
mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
|
5242
5677
|
|
5243
5678
|
(void) dst;
|
5244
|
-
(void) src0_ddq_i;
|
5245
|
-
(void) i02;
|
5246
|
-
(void) i1;
|
5247
5679
|
}
|
5248
5680
|
|
5249
5681
|
inline void ggml_cuda_op_gelu(
|
5250
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5251
|
-
float *
|
5252
|
-
cudaStream_t & cudaStream_main){
|
5253
|
-
|
5254
|
-
GGML_ASSERT(src0_ddf_i != nullptr);
|
5255
|
-
GGML_ASSERT(dst_ddf_i != nullptr);
|
5682
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5683
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5256
5684
|
|
5257
|
-
|
5258
|
-
|
5685
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
5686
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5259
5687
|
|
5260
|
-
|
5261
|
-
gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
|
5688
|
+
gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
|
5262
5689
|
|
5263
5690
|
(void) src1;
|
5264
5691
|
(void) dst;
|
5265
|
-
(void)
|
5266
|
-
(void) src1_ddf_i;
|
5267
|
-
(void) i02;
|
5268
|
-
(void) i1;
|
5692
|
+
(void) src1_dd;
|
5269
5693
|
}
|
5270
5694
|
|
5271
5695
|
inline void ggml_cuda_op_silu(
|
5272
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5273
|
-
float *
|
5274
|
-
cudaStream_t & cudaStream_main){
|
5696
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5697
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5275
5698
|
|
5276
|
-
GGML_ASSERT(
|
5277
|
-
GGML_ASSERT(
|
5699
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
5700
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5278
5701
|
|
5279
|
-
|
5280
|
-
const int64_t i01_diff = i01_high - i01_low;
|
5281
|
-
|
5282
|
-
// compute
|
5283
|
-
silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
|
5702
|
+
silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
|
5284
5703
|
|
5285
5704
|
(void) src1;
|
5286
5705
|
(void) dst;
|
5287
|
-
(void)
|
5288
|
-
(void) src1_ddf_i;
|
5289
|
-
(void) i02;
|
5290
|
-
(void) i1;
|
5706
|
+
(void) src1_dd;
|
5291
5707
|
}
|
5292
5708
|
|
5293
5709
|
inline void ggml_cuda_op_norm(
|
5294
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5295
|
-
float *
|
5296
|
-
cudaStream_t & cudaStream_main){
|
5710
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5711
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5297
5712
|
|
5298
|
-
GGML_ASSERT(
|
5299
|
-
GGML_ASSERT(
|
5713
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
5714
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5300
5715
|
|
5301
5716
|
const int64_t ne00 = src0->ne[0];
|
5302
|
-
const int64_t
|
5717
|
+
const int64_t nrows = ggml_nrows(src0);
|
5303
5718
|
|
5304
|
-
|
5305
|
-
norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
|
5719
|
+
norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
|
5306
5720
|
|
5307
5721
|
(void) src1;
|
5308
5722
|
(void) dst;
|
5309
|
-
(void)
|
5310
|
-
(void) src1_ddf_i;
|
5311
|
-
(void) i02;
|
5312
|
-
(void) i1;
|
5723
|
+
(void) src1_dd;
|
5313
5724
|
}
|
5314
5725
|
|
5315
5726
|
inline void ggml_cuda_op_rms_norm(
|
5316
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5317
|
-
float *
|
5318
|
-
cudaStream_t & cudaStream_main){
|
5727
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5728
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5319
5729
|
|
5320
|
-
GGML_ASSERT(
|
5321
|
-
GGML_ASSERT(
|
5730
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
5731
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5322
5732
|
|
5323
5733
|
const int64_t ne00 = src0->ne[0];
|
5324
|
-
const int64_t
|
5734
|
+
const int64_t nrows = ggml_nrows(src0);
|
5325
5735
|
|
5326
5736
|
float eps;
|
5327
5737
|
memcpy(&eps, dst->op_params, sizeof(float));
|
5328
5738
|
|
5329
|
-
|
5330
|
-
rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
|
5739
|
+
rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
|
5331
5740
|
|
5332
5741
|
(void) src1;
|
5333
5742
|
(void) dst;
|
5334
|
-
(void)
|
5335
|
-
(void) src1_ddf_i;
|
5336
|
-
(void) i02;
|
5337
|
-
(void) i1;
|
5743
|
+
(void) src1_dd;
|
5338
5744
|
}
|
5339
5745
|
|
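The ops in this hunk are rewritten to take whole-tensor device pointers (`src0_dd`, `dst_dd`) plus a single stream instead of per-slice pointers and row ranges, with sizes taken from `ggml_nelements`/`ggml_nrows`. As a rough illustration of what a call like `rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream)` computes, here is a minimal, unoptimized sketch (one thread per row; this is not the library's kernel, which parallelizes within each row):

```cpp
#include <cuda_runtime.h>

// Minimal RMS-norm sketch: normalize each row of length ncols by its root mean square.
__global__ void rms_norm_rows(const float * src, float * dst, int ncols, int nrows, float eps) {
    const int row = blockIdx.x*blockDim.x + threadIdx.x;
    if (row >= nrows) {
        return;
    }
    const float * x = src + (size_t) row*ncols;
    float       * y = dst + (size_t) row*ncols;

    float sum = 0.0f;
    for (int i = 0; i < ncols; ++i) {
        sum += x[i]*x[i];
    }
    const float scale = rsqrtf(sum/ncols + eps); // 1/RMS, with eps for numerical stability

    for (int i = 0; i < ncols; ++i) {
        y[i] = x[i]*scale;
    }
}

// Host launcher in the style of the refactored ops: whole tensor, one stream.
static void rms_norm_rows_cuda(const float * src, float * dst, int ncols, int nrows, float eps, cudaStream_t stream) {
    const int block = 256;
    const int grid  = (nrows + block - 1)/block;
    rms_norm_rows<<<grid, block, 0, stream>>>(src, dst, ncols, nrows, eps);
}
```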
5340
5746
|
inline void ggml_cuda_op_mul_mat_q(
|
5341
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
|
5342
|
-
|
5343
|
-
cudaStream_t &
|
5344
|
-
|
5345
|
-
GGML_ASSERT(src0_ddq_i != nullptr);
|
5346
|
-
GGML_ASSERT(src1_ddf_i != nullptr);
|
5347
|
-
GGML_ASSERT(dst_ddf_i != nullptr);
|
5747
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
5748
|
+
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
5749
|
+
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
5348
5750
|
|
5349
5751
|
const int64_t ne00 = src0->ne[0];
|
5350
5752
|
|
5351
5753
|
const int64_t ne10 = src1->ne[0];
|
5352
|
-
const int64_t ne11 = src1->ne[1];
|
5353
5754
|
GGML_ASSERT(ne10 % QK8_1 == 0);
|
5354
5755
|
|
5355
5756
|
const int64_t ne0 = dst->ne[0];
|
5356
5757
|
|
5357
|
-
const int64_t
|
5758
|
+
const int64_t row_diff = row_high - row_low;
|
5358
5759
|
|
5359
5760
|
int id;
|
5360
5761
|
CUDA_CHECK(cudaGetDevice(&id));
|
5361
5762
|
|
5362
5763
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
5363
5764
|
// nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
|
5364
|
-
const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 :
|
5365
|
-
|
5366
|
-
const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
|
5367
|
-
ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
|
5368
|
-
size_t as;
|
5369
|
-
void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
|
5370
|
-
quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
|
5765
|
+
const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
|
5371
5766
|
|
5372
5767
|
switch (src0->type) {
|
5373
5768
|
case GGML_TYPE_Q4_0:
|
5374
|
-
ggml_mul_mat_q4_0_q8_1_cuda(
|
5769
|
+
ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5375
5770
|
break;
|
5376
5771
|
case GGML_TYPE_Q4_1:
|
5377
|
-
ggml_mul_mat_q4_1_q8_1_cuda(
|
5772
|
+
ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5378
5773
|
break;
|
5379
5774
|
case GGML_TYPE_Q5_0:
|
5380
|
-
ggml_mul_mat_q5_0_q8_1_cuda(
|
5775
|
+
ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5381
5776
|
break;
|
5382
5777
|
case GGML_TYPE_Q5_1:
|
5383
|
-
ggml_mul_mat_q5_1_q8_1_cuda(
|
5778
|
+
ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5384
5779
|
break;
|
5385
5780
|
case GGML_TYPE_Q8_0:
|
5386
|
-
ggml_mul_mat_q8_0_q8_1_cuda(
|
5781
|
+
ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5387
5782
|
break;
|
5388
5783
|
case GGML_TYPE_Q2_K:
|
5389
|
-
ggml_mul_mat_q2_K_q8_1_cuda(
|
5784
|
+
ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5390
5785
|
break;
|
5391
5786
|
case GGML_TYPE_Q3_K:
|
5392
|
-
ggml_mul_mat_q3_K_q8_1_cuda(
|
5787
|
+
ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5393
5788
|
break;
|
5394
5789
|
case GGML_TYPE_Q4_K:
|
5395
|
-
ggml_mul_mat_q4_K_q8_1_cuda(
|
5790
|
+
ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5396
5791
|
break;
|
5397
5792
|
case GGML_TYPE_Q5_K:
|
5398
|
-
ggml_mul_mat_q5_K_q8_1_cuda(
|
5793
|
+
ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5399
5794
|
break;
|
5400
5795
|
case GGML_TYPE_Q6_K:
|
5401
|
-
ggml_mul_mat_q6_K_q8_1_cuda(
|
5796
|
+
ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
|
5402
5797
|
break;
|
5403
5798
|
default:
|
5404
5799
|
GGML_ASSERT(false);
|
5405
5800
|
break;
|
5406
5801
|
}
|
5407
5802
|
|
5408
|
-
ggml_cuda_pool_free(src1_q8_1, as);
|
5409
|
-
|
5410
5803
|
(void) src1;
|
5411
5804
|
(void) dst;
|
5412
|
-
(void)
|
5413
|
-
(void) i02;
|
5414
|
-
(void) i1;
|
5805
|
+
(void) src1_ddf_i;
|
5415
5806
|
}
|
5416
5807
|
|
5417
5808
|
static int64_t get_row_rounding(ggml_type type) {
|
5418
|
-
|
5419
|
-
|
5420
|
-
|
5421
|
-
|
5422
|
-
|
5809
|
+
int64_t min_compute_capability = INT_MAX;
|
5810
|
+
int64_t max_compute_capability = INT_MIN;
|
5811
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
5812
|
+
if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
5813
|
+
if (min_compute_capability > g_compute_capabilities[id]) {
|
5814
|
+
min_compute_capability = g_compute_capabilities[id];
|
5815
|
+
}
|
5816
|
+
if (max_compute_capability < g_compute_capabilities[id]) {
|
5817
|
+
max_compute_capability = g_compute_capabilities[id];
|
5818
|
+
}
|
5423
5819
|
}
|
5424
5820
|
}
|
5425
5821
|
|
5822
|
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
5823
|
+
switch(type) {
|
5824
|
+
case GGML_TYPE_Q4_0:
|
5825
|
+
case GGML_TYPE_Q4_1:
|
5826
|
+
case GGML_TYPE_Q5_0:
|
5827
|
+
case GGML_TYPE_Q5_1:
|
5828
|
+
case GGML_TYPE_Q8_0:
|
5829
|
+
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
5830
|
+
case GGML_TYPE_F16:
|
5831
|
+
return 1;
|
5832
|
+
case GGML_TYPE_Q2_K:
|
5833
|
+
return max_compute_capability >= CC_RDNA2 ? 128 : 32;
|
5834
|
+
case GGML_TYPE_Q3_K:
|
5835
|
+
return min_compute_capability < CC_RDNA2 ? 128 : 64;
|
5836
|
+
case GGML_TYPE_Q4_K:
|
5837
|
+
case GGML_TYPE_Q5_K:
|
5838
|
+
case GGML_TYPE_Q6_K:
|
5839
|
+
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
5840
|
+
default:
|
5841
|
+
GGML_ASSERT(false);
|
5842
|
+
}
|
5843
|
+
#else
|
5426
5844
|
switch(type) {
|
5427
5845
|
case GGML_TYPE_Q4_0:
|
5428
5846
|
case GGML_TYPE_Q4_1:
|
@@ -5443,170 +5861,147 @@ static int64_t get_row_rounding(ggml_type type) {
|
|
5443
5861
|
default:
|
5444
5862
|
GGML_ASSERT(false);
|
5445
5863
|
}
|
5864
|
+
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
5446
5865
|
}
|
5447
5866
|
|
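For split tensors the per-device row ranges are taken from `g_tensor_split` and rounded down to the tile size returned by `get_row_rounding`, as shown further below in `ggml_cuda_op_mul_mat`. A small self-contained example of that arithmetic (the split fraction, row count, and rounding value here are made up):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne01     = 4096;   // rows of src0
    const float   split1   = 0.6f;   // hypothetical start fraction of the second GPU
    const int64_t rounding = 64;     // e.g. Q4_0 below the CC_RDNA2/CC_TURING thresholds

    int64_t row_low[2]  = {0, 0};
    int64_t row_high[2] = {ne01, ne01};

    // device 1 starts at the split point, rounded down to a tile boundary
    row_low[1]  = (int64_t)(ne01*split1);
    row_low[1] -= row_low[1] % rounding;
    // device 0 ends where device 1 starts (same rounded value)
    row_high[0] = row_low[1];

    printf("dev0: [%lld, %lld)  dev1: [%lld, %lld)\n",
           (long long) row_low[0], (long long) row_high[0],
           (long long) row_low[1], (long long) row_high[1]);
    return 0;
}
```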
5448
|
-
inline void
|
5449
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
|
5450
|
-
|
5451
|
-
cudaStream_t &
|
5452
|
-
|
5453
|
-
GGML_ASSERT(src0_ddq_i != nullptr);
|
5454
|
-
GGML_ASSERT(src1_ddf_i != nullptr);
|
5455
|
-
GGML_ASSERT(dst_ddf_i != nullptr);
|
5867
|
+
inline void ggml_cuda_op_mul_mat_vec_q(
|
5868
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
5869
|
+
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
5870
|
+
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
5456
5871
|
|
5457
5872
|
const int64_t ne00 = src0->ne[0];
|
5458
|
-
const int64_t
|
5873
|
+
const int64_t row_diff = row_high - row_low;
|
5459
5874
|
|
5460
|
-
|
5461
|
-
|
5462
|
-
|
5463
|
-
|
5464
|
-
|
5465
|
-
|
5875
|
+
switch (src0->type) {
|
5876
|
+
case GGML_TYPE_Q4_0:
|
5877
|
+
mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5878
|
+
break;
|
5879
|
+
case GGML_TYPE_Q4_1:
|
5880
|
+
mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5881
|
+
break;
|
5882
|
+
case GGML_TYPE_Q5_0:
|
5883
|
+
mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5884
|
+
break;
|
5885
|
+
case GGML_TYPE_Q5_1:
|
5886
|
+
mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5887
|
+
break;
|
5888
|
+
case GGML_TYPE_Q8_0:
|
5889
|
+
mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5890
|
+
break;
|
5891
|
+
case GGML_TYPE_Q2_K:
|
5892
|
+
mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5893
|
+
break;
|
5894
|
+
case GGML_TYPE_Q3_K:
|
5895
|
+
mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5896
|
+
break;
|
5897
|
+
case GGML_TYPE_Q4_K:
|
5898
|
+
mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5899
|
+
break;
|
5900
|
+
case GGML_TYPE_Q5_K:
|
5901
|
+
mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5902
|
+
break;
|
5903
|
+
case GGML_TYPE_Q6_K:
|
5904
|
+
mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
5905
|
+
break;
|
5906
|
+
default:
|
5907
|
+
GGML_ASSERT(false);
|
5908
|
+
break;
|
5909
|
+
}
|
5466
5910
|
|
5467
|
-
|
5468
|
-
|
5469
|
-
|
5470
|
-
|
5471
|
-
|
5472
|
-
|
5473
|
-
#if QK_K == 256
|
5474
|
-
mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
|
5475
|
-
src0->type == GGML_TYPE_Q2_K ||
|
5476
|
-
src0->type == GGML_TYPE_Q3_K ||
|
5477
|
-
src0->type == GGML_TYPE_Q4_K ||
|
5478
|
-
src0->type == GGML_TYPE_Q5_K ||
|
5479
|
-
src0->type == GGML_TYPE_Q6_K;
|
5480
|
-
#endif // QK_K == 256
|
5481
|
-
|
5482
|
-
const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
|
5483
|
-
#endif
|
5911
|
+
(void) src1;
|
5912
|
+
(void) dst;
|
5913
|
+
(void) src1_ddf_i;
|
5914
|
+
(void) src1_ncols;
|
5915
|
+
(void) src1_padded_row_size;
|
5916
|
+
}
|
5484
5917
|
|
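The `src1_padded_row_size` argument used by the q8_1 ops above is `ne10` rounded up to a multiple of `MATRIX_ROW_PADDING` before src1 is quantized, as computed later in `ggml_cuda_op_mul_mat`. A standalone sketch of that rounding (the padding value of 512 is an assumption about a define that lives outside this hunk):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t MATRIX_ROW_PADDING = 512; // assumed padding granularity for quantized rows
    const int64_t ne10_aligned = 5120;      // a row length that is already a multiple
    const int64_t ne10_odd     = 5000;      // an unaligned row length

    // same expression as in the diff: round up to the next multiple unless already aligned
    auto pad = [&](int64_t n) {
        return n % MATRIX_ROW_PADDING == 0 ? n : n - n % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
    };

    printf("%lld -> %lld\n", (long long) ne10_aligned, (long long) pad(ne10_aligned)); // 5120 -> 5120
    printf("%lld -> %lld\n", (long long) ne10_odd,     (long long) pad(ne10_odd));     // 5000 -> 5120
    return 0;
}
```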
5485
|
-
|
5486
|
-
|
5487
|
-
|
5488
|
-
|
5489
|
-
void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
|
5490
|
-
quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
|
5491
|
-
|
5492
|
-
switch (src0->type) {
|
5493
|
-
case GGML_TYPE_Q4_0:
|
5494
|
-
mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5495
|
-
break;
|
5496
|
-
case GGML_TYPE_Q4_1:
|
5497
|
-
mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5498
|
-
break;
|
5499
|
-
case GGML_TYPE_Q5_0:
|
5500
|
-
mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5501
|
-
break;
|
5502
|
-
case GGML_TYPE_Q5_1:
|
5503
|
-
mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5504
|
-
break;
|
5505
|
-
case GGML_TYPE_Q8_0:
|
5506
|
-
mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5507
|
-
break;
|
5508
|
-
case GGML_TYPE_Q2_K:
|
5509
|
-
mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5510
|
-
break;
|
5511
|
-
case GGML_TYPE_Q3_K:
|
5512
|
-
mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5513
|
-
break;
|
5514
|
-
case GGML_TYPE_Q4_K:
|
5515
|
-
mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5516
|
-
break;
|
5517
|
-
case GGML_TYPE_Q5_K:
|
5518
|
-
mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5519
|
-
break;
|
5520
|
-
case GGML_TYPE_Q6_K:
|
5521
|
-
mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
5522
|
-
break;
|
5523
|
-
default:
|
5524
|
-
GGML_ASSERT(false);
|
5525
|
-
break;
|
5526
|
-
}
|
5918
|
+
inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
5919
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
5920
|
+
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
5921
|
+
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
5527
5922
|
|
5528
|
-
|
5529
|
-
|
5530
|
-
|
5923
|
+
const int64_t ne00 = src0->ne[0];
|
5924
|
+
const int64_t row_diff = row_high - row_low;
|
5925
|
+
|
5926
|
+
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
5531
5927
|
#ifdef GGML_CUDA_F16
|
5532
|
-
|
5533
|
-
|
5534
|
-
|
5535
|
-
|
5536
|
-
|
5537
|
-
|
5538
|
-
|
5539
|
-
|
5540
|
-
|
5541
|
-
|
5542
|
-
|
5543
|
-
|
5544
|
-
|
5928
|
+
size_t ash;
|
5929
|
+
dfloat * src1_dfloat = nullptr; // dfloat == half
|
5930
|
+
|
5931
|
+
bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
5932
|
+
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
|
5933
|
+
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
5934
|
+
|
5935
|
+
if (src1_convert_f16) {
|
5936
|
+
src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
|
5937
|
+
ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
|
5938
|
+
ne00, 1, sizeof(float), 0, 0,
|
5939
|
+
ne00, 1, sizeof(half), 0, 0, stream);
|
5940
|
+
}
|
5545
5941
|
#else
|
5546
|
-
|
5942
|
+
const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
|
5547
5943
|
#endif // GGML_CUDA_F16
|
5548
5944
|
|
5549
|
-
|
5550
|
-
|
5551
|
-
|
5552
|
-
|
5553
|
-
|
5554
|
-
|
5555
|
-
|
5556
|
-
|
5557
|
-
|
5558
|
-
|
5559
|
-
|
5560
|
-
|
5561
|
-
|
5562
|
-
|
5563
|
-
|
5564
|
-
|
5565
|
-
|
5566
|
-
|
5567
|
-
|
5568
|
-
|
5569
|
-
|
5570
|
-
|
5571
|
-
|
5572
|
-
|
5573
|
-
|
5574
|
-
|
5575
|
-
|
5576
|
-
|
5577
|
-
|
5578
|
-
|
5579
|
-
|
5580
|
-
|
5581
|
-
|
5582
|
-
|
5583
|
-
|
5584
|
-
|
5585
|
-
|
5586
|
-
|
5945
|
+
switch (src0->type) {
|
5946
|
+
case GGML_TYPE_Q4_0:
|
5947
|
+
dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5948
|
+
break;
|
5949
|
+
case GGML_TYPE_Q4_1:
|
5950
|
+
dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5951
|
+
break;
|
5952
|
+
case GGML_TYPE_Q5_0:
|
5953
|
+
dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5954
|
+
break;
|
5955
|
+
case GGML_TYPE_Q5_1:
|
5956
|
+
dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5957
|
+
break;
|
5958
|
+
case GGML_TYPE_Q8_0:
|
5959
|
+
dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5960
|
+
break;
|
5961
|
+
case GGML_TYPE_Q2_K:
|
5962
|
+
dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5963
|
+
break;
|
5964
|
+
case GGML_TYPE_Q3_K:
|
5965
|
+
dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5966
|
+
break;
|
5967
|
+
case GGML_TYPE_Q4_K:
|
5968
|
+
dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5969
|
+
break;
|
5970
|
+
case GGML_TYPE_Q5_K:
|
5971
|
+
dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5972
|
+
break;
|
5973
|
+
case GGML_TYPE_Q6_K:
|
5974
|
+
dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
5975
|
+
break;
|
5976
|
+
case GGML_TYPE_F16:
|
5977
|
+
convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
5978
|
+
break;
|
5979
|
+
default:
|
5980
|
+
GGML_ASSERT(false);
|
5981
|
+
break;
|
5982
|
+
}
|
5587
5983
|
|
5588
5984
|
#ifdef GGML_CUDA_F16
|
5589
|
-
|
5590
|
-
|
5591
|
-
}
|
5592
|
-
#endif // GGML_CUDA_F16
|
5985
|
+
if (src1_convert_f16) {
|
5986
|
+
ggml_cuda_pool_free(src1_dfloat, ash);
|
5593
5987
|
}
|
5988
|
+
#endif // GGML_CUDA_F16
|
5594
5989
|
|
5595
5990
|
(void) src1;
|
5596
5991
|
(void) dst;
|
5597
|
-
(void)
|
5598
|
-
(void)
|
5599
|
-
(void)
|
5992
|
+
(void) src1_ddq_i;
|
5993
|
+
(void) src1_ncols;
|
5994
|
+
(void) src1_padded_row_size;
|
5600
5995
|
}
|
5601
5996
|
|
5602
5997
|
inline void ggml_cuda_op_mul_mat_cublas(
|
5603
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char *
|
5604
|
-
|
5605
|
-
cudaStream_t &
|
5998
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
5999
|
+
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
6000
|
+
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
5606
6001
|
|
5607
|
-
GGML_ASSERT(
|
6002
|
+
GGML_ASSERT(src0_dd_i != nullptr);
|
5608
6003
|
GGML_ASSERT(src1_ddf_i != nullptr);
|
5609
|
-
GGML_ASSERT(
|
6004
|
+
GGML_ASSERT(dst_dd_i != nullptr);
|
5610
6005
|
|
5611
6006
|
const float alpha = 1.0f;
|
5612
6007
|
const float beta = 0.0f;
|
@@ -5614,43 +6009,54 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|
5614
6009
|
const int64_t ne00 = src0->ne[0];
|
5615
6010
|
|
5616
6011
|
const int64_t ne10 = src1->ne[0];
|
5617
|
-
const int64_t ne11 = src1->ne[1];
|
5618
6012
|
|
5619
6013
|
const int64_t ne0 = dst->ne[0];
|
5620
|
-
const int64_t
|
6014
|
+
const int64_t row_diff = row_high - row_low;
|
6015
|
+
|
6016
|
+
float * src0_ddq_as_f32;
|
6017
|
+
size_t src0_as = 0;
|
6018
|
+
|
6019
|
+
if (src0->type != GGML_TYPE_F32) {
|
6020
|
+
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
6021
|
+
src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
|
6022
|
+
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
|
6023
|
+
}
|
6024
|
+
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
|
5621
6025
|
|
5622
6026
|
int id;
|
5623
6027
|
CUDA_CHECK(cudaGetDevice(&id));
|
5624
6028
|
|
5625
6029
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
5626
6030
|
// ldc == nrows of the matrix that cuBLAS writes into
|
5627
|
-
int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 :
|
6031
|
+
int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
|
5628
6032
|
|
5629
|
-
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id],
|
6033
|
+
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
|
5630
6034
|
CUBLAS_CHECK(
|
5631
6035
|
cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
5632
|
-
|
6036
|
+
row_diff, src1_ncols, ne10,
|
5633
6037
|
&alpha, src0_ddf_i, ne00,
|
5634
|
-
src1_ddf_i,
|
5635
|
-
&beta,
|
6038
|
+
src1_ddf_i, ne10,
|
6039
|
+
&beta, dst_dd_i, ldc));
|
6040
|
+
|
6041
|
+
if (src0_as > 0) {
|
6042
|
+
ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
|
6043
|
+
}
|
5636
6044
|
|
5637
6045
|
(void) dst;
|
5638
|
-
(void)
|
5639
|
-
(void)
|
5640
|
-
(void) i1;
|
6046
|
+
(void) src1_ddq_i;
|
6047
|
+
(void) src1_padded_row_size;
|
5641
6048
|
}
|
5642
6049
|
|
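The `cublasSgemm` call above multiplies the (possibly dequantized) slice of src0 rows by `src1_ncols` columns of src1. A CPU reference for what that produces in the non-main-device case, where `ldc == row_diff` (toy sizes, made-up data, not the backend's code):

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int row_diff = 2, src1_ncols = 3, ne10 = 4; // tiny made-up sizes (ne10 == ne00)

    // src0 slice: row_diff rows of length ne10, row-major as ggml stores it
    std::vector<float> src0 = {1, 2, 3, 4,
                               5, 6, 7, 8};
    // src1 slice: src1_ncols columns of length ne10, each column contiguous
    std::vector<float> src1(src1_ncols*ne10, 1.0f);

    // result: column-major with leading dimension row_diff (the ldc used above)
    std::vector<float> dst(src1_ncols*row_diff, 0.0f);
    for (int j = 0; j < src1_ncols; ++j) {
        for (int i = 0; i < row_diff; ++i) {
            float acc = 0.0f;
            for (int k = 0; k < ne10; ++k) {
                acc += src0[i*ne10 + k]*src1[j*ne10 + k]; // dot(src0 row i, src1 column j)
            }
            dst[j*row_diff + i] = acc;
        }
    }
    printf("dst[0][0] = %f\n", dst[0]); // 1+2+3+4 = 10
    return 0;
}
```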
5643
6050
|
inline void ggml_cuda_op_rope(
|
5644
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5645
|
-
float *
|
5646
|
-
cudaStream_t & cudaStream_main){
|
6051
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6052
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5647
6053
|
|
5648
|
-
GGML_ASSERT(
|
5649
|
-
GGML_ASSERT(
|
6054
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6055
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5650
6056
|
|
5651
6057
|
const int64_t ne00 = src0->ne[0];
|
5652
6058
|
const int64_t ne01 = src0->ne[1];
|
5653
|
-
const int64_t
|
6059
|
+
const int64_t nrows = ggml_nrows(src0);
|
5654
6060
|
|
5655
6061
|
const int n_past = ((int32_t *) dst->op_params)[0];
|
5656
6062
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
@@ -5663,44 +6069,37 @@ inline void ggml_cuda_op_rope(
|
|
5663
6069
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
5664
6070
|
|
5665
6071
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
6072
|
+
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
5666
6073
|
|
5667
6074
|
const bool is_neox = mode & 2;
|
5668
6075
|
const bool is_glm = mode & 4;
|
5669
6076
|
|
5670
6077
|
// compute
|
5671
6078
|
if (is_glm) {
|
5672
|
-
|
5673
|
-
const float id_p = min(p, n_ctx - 2.f);
|
5674
|
-
const float block_p = max(p - (n_ctx - 2.f), 0.f);
|
5675
|
-
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
|
6079
|
+
rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream);
|
5676
6080
|
} else if (is_neox) {
|
5677
6081
|
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
|
5678
|
-
|
5679
|
-
rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
6082
|
+
rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
|
5680
6083
|
} else {
|
5681
|
-
|
5682
|
-
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
6084
|
+
rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
|
5683
6085
|
}
|
5684
6086
|
|
5685
6087
|
(void) src1;
|
5686
6088
|
(void) dst;
|
5687
|
-
(void)
|
5688
|
-
(void) src1_ddf_i;
|
5689
|
-
(void) i1;
|
6089
|
+
(void) src1_dd;
|
5690
6090
|
}
|
5691
6091
|
|
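A worked example of the angle schedule implied by `theta_scale` in the RoPE op above: each consecutive dimension pair is rotated by an angle that shrinks geometrically by `theta_scale`. This is a simplified sketch (it ignores `freq_scale` and the NeoX/GLM variants, and the position value is made up):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const int   n_dims      = 128;
    const float freq_base   = 10000.0f;
    const float theta_scale = powf(freq_base, -2.0f/n_dims); // same formula as in the hunk

    float theta = 7.0f;           // angle for dimension pair 0 at token position 7
    for (int i = 0; i < 3; ++i) { // first few dimension pairs
        printf("pair %d: rotate (x[%d], x[%d]) by %.4f rad\n", i, 2*i, 2*i + 1, theta);
        theta *= theta_scale;     // each pair's angle shrinks geometrically
    }
    return 0;
}
```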
5692
6092
|
inline void ggml_cuda_op_alibi(
|
5693
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5694
|
-
float *
|
5695
|
-
cudaStream_t & cudaStream_main){
|
6093
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6094
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5696
6095
|
|
5697
|
-
GGML_ASSERT(
|
5698
|
-
GGML_ASSERT(
|
6096
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6097
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5699
6098
|
|
5700
6099
|
const int64_t ne00 = src0->ne[0];
|
5701
6100
|
const int64_t ne01 = src0->ne[1];
|
5702
6101
|
const int64_t ne02 = src0->ne[2];
|
5703
|
-
const int64_t
|
6102
|
+
const int64_t nrows = ggml_nrows(src0);
|
5704
6103
|
|
5705
6104
|
const int n_past = ((int32_t *) dst->op_params)[0];
|
5706
6105
|
const int n_head = ((int32_t *) dst->op_params)[1];
|
@@ -5715,334 +6114,354 @@ inline void ggml_cuda_op_alibi(
|
|
5715
6114
|
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
5716
6115
|
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
5717
6116
|
|
5718
|
-
|
5719
|
-
alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
|
6117
|
+
alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
|
5720
6118
|
|
5721
6119
|
(void) src1;
|
5722
|
-
(void)
|
5723
|
-
(void) src1_ddf_i;
|
5724
|
-
(void) i1;
|
6120
|
+
(void) src1_dd;
|
5725
6121
|
}
|
5726
6122
|
|
5727
6123
|
inline void ggml_cuda_op_diag_mask_inf(
|
5728
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5729
|
-
float *
|
5730
|
-
cudaStream_t & cudaStream_main){
|
6124
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6125
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5731
6126
|
|
5732
|
-
GGML_ASSERT(
|
5733
|
-
GGML_ASSERT(
|
6127
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6128
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5734
6129
|
|
5735
6130
|
const int64_t ne00 = src0->ne[0];
|
5736
6131
|
const int64_t ne01 = src0->ne[1];
|
5737
|
-
const
|
6132
|
+
const int nrows0 = ggml_nrows(src0);
|
5738
6133
|
|
5739
6134
|
const int n_past = ((int32_t *) dst->op_params)[0];
|
5740
6135
|
|
5741
|
-
|
5742
|
-
diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
|
6136
|
+
diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
|
5743
6137
|
|
5744
6138
|
(void) src1;
|
5745
6139
|
(void) dst;
|
5746
|
-
(void)
|
5747
|
-
(void) src1_ddf_i;
|
5748
|
-
(void) i02;
|
5749
|
-
(void) i1;
|
6140
|
+
(void) src1_dd;
|
5750
6141
|
}
|
5751
6142
|
|
5752
6143
|
inline void ggml_cuda_op_soft_max(
|
5753
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5754
|
-
float *
|
5755
|
-
cudaStream_t & cudaStream_main){
|
6144
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6145
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5756
6146
|
|
5757
|
-
GGML_ASSERT(
|
5758
|
-
GGML_ASSERT(
|
6147
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6148
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5759
6149
|
|
5760
6150
|
const int64_t ne00 = src0->ne[0];
|
5761
|
-
const int64_t
|
6151
|
+
const int64_t nrows = ggml_nrows(src0);
|
5762
6152
|
|
5763
|
-
|
5764
|
-
soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
|
6153
|
+
soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
|
5765
6154
|
|
5766
6155
|
(void) src1;
|
5767
6156
|
(void) dst;
|
5768
|
-
(void)
|
5769
|
-
(void) src1_ddf_i;
|
5770
|
-
(void) i02;
|
5771
|
-
(void) i1;
|
6157
|
+
(void) src1_dd;
|
5772
6158
|
}
|
5773
6159
|
|
5774
6160
|
inline void ggml_cuda_op_scale(
|
5775
|
-
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
5776
|
-
float *
|
5777
|
-
cudaStream_t & cudaStream_main){
|
6161
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
6162
|
+
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
5778
6163
|
|
5779
|
-
GGML_ASSERT(
|
5780
|
-
GGML_ASSERT(
|
6164
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
6165
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
6166
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
5781
6167
|
|
5782
6168
|
const float scale = ((float *) src1->data)[0];
|
5783
6169
|
|
5784
|
-
|
5785
|
-
const int64_t i01_diff = i01_high - i01_low;
|
5786
|
-
|
5787
|
-
// compute
|
5788
|
-
scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
|
6170
|
+
scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
|
5789
6171
|
CUDA_CHECK(cudaGetLastError());
|
5790
6172
|
|
5791
6173
|
(void) src1;
|
5792
6174
|
(void) dst;
|
5793
|
-
(void)
|
5794
|
-
|
5795
|
-
|
5796
|
-
|
6175
|
+
(void) src1_dd;
|
6176
|
+
}
|
6177
|
+
|
6178
|
+
static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
|
6179
|
+
const int64_t nrows0 = ggml_nrows(src0);
|
6180
|
+
|
6181
|
+
const bool use_src1 = src1 != nullptr;
|
6182
|
+
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
|
6183
|
+
|
6184
|
+
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
|
6185
|
+
GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
|
6186
|
+
|
6187
|
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6188
|
+
struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
6189
|
+
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
6190
|
+
|
6191
|
+
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
6192
|
+
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
|
6193
|
+
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
|
6194
|
+
|
6195
|
+
const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
|
6196
|
+
|
6197
|
+
// dd = data device
|
6198
|
+
float * src0_ddf = nullptr;
|
6199
|
+
float * src1_ddf = nullptr;
|
6200
|
+
float * dst_ddf = nullptr;
|
6201
|
+
|
6202
|
+
// as = actual size
|
6203
|
+
size_t src0_asf = 0;
|
6204
|
+
size_t src1_asf = 0;
|
6205
|
+
size_t dst_asf = 0;
|
6206
|
+
|
6207
|
+
ggml_cuda_set_device(g_main_device);
|
6208
|
+
const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
6209
|
+
|
6210
|
+
if (src0_on_device) {
|
6211
|
+
src0_ddf = (float *) src0_extra->data_device[g_main_device];
|
6212
|
+
} else {
|
6213
|
+
src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
|
6214
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
|
6215
|
+
}
|
6216
|
+
|
6217
|
+
if (use_src1 && !src1_stays_on_host) {
|
6218
|
+
if (src1_on_device) {
|
6219
|
+
src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
6220
|
+
} else {
|
6221
|
+
src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
|
6222
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
|
6223
|
+
}
|
6224
|
+
}
|
6225
|
+
if (dst_on_device) {
|
6226
|
+
dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
6227
|
+
} else {
|
6228
|
+
dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
|
6229
|
+
}
|
6230
|
+
|
6231
|
+
// do the computation
|
6232
|
+
op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
|
6233
|
+
CUDA_CHECK(cudaGetLastError());
|
6234
|
+
|
6235
|
+
// copy dst to host if necessary
|
6236
|
+
if (!dst_on_device) {
|
6237
|
+
CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
|
6238
|
+
}
|
6239
|
+
|
6240
|
+
if (src0_asf > 0) {
|
6241
|
+
ggml_cuda_pool_free(src0_ddf, src0_asf);
|
6242
|
+
}
|
6243
|
+
if (src1_asf > 0) {
|
6244
|
+
ggml_cuda_pool_free(src1_ddf, src1_asf);
|
6245
|
+
}
|
6246
|
+
if (dst_asf > 0) {
|
6247
|
+
ggml_cuda_pool_free(dst_ddf, dst_asf);
|
6248
|
+
}
|
6249
|
+
|
6250
|
+
if (dst->backend == GGML_BACKEND_CPU) {
|
6251
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
6252
|
+
}
|
5797
6253
|
}
|
5798
6254
|
|
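`ggml_cuda_op_flatten` takes a callback with the same signature as the simple ops above (`ggml_cuda_op_add`, `ggml_cuda_op_gelu`, `ggml_cuda_op_rms_norm`, ...). The typedef itself is declared earlier in ggml-cuda.cu and is not part of this hunk; its assumed shape, inferred from the call `op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream)`, is:

```cpp
#include <cuda_runtime.h>
#include "ggml.h"

// Assumed shape of the flatten-op callback (inferred from this hunk, not copied from
// the file): whole-tensor device pointers plus the main device's stream.
typedef void (*ggml_cuda_op_flatten_t)(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd,
    const cudaStream_t & main_stream);
```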
5799
|
-
static void
|
5800
|
-
|
6255
|
+
static void ggml_cuda_op_mul_mat(
|
6256
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
|
6257
|
+
const bool convert_src1_to_q8_1) {
|
6258
|
+
|
5801
6259
|
const int64_t ne00 = src0->ne[0];
|
5802
6260
|
const int64_t ne01 = src0->ne[1];
|
5803
6261
|
const int64_t ne02 = src0->ne[2];
|
5804
6262
|
const int64_t ne03 = src0->ne[3];
|
5805
6263
|
const int64_t nrows0 = ggml_nrows(src0);
|
5806
6264
|
|
5807
|
-
const
|
5808
|
-
const int64_t
|
5809
|
-
const int64_t
|
5810
|
-
const int64_t
|
5811
|
-
const int64_t
|
5812
|
-
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
|
6265
|
+
const int64_t ne10 = src1->ne[0];
|
6266
|
+
const int64_t ne11 = src1->ne[1];
|
6267
|
+
const int64_t ne12 = src1->ne[2];
|
6268
|
+
const int64_t ne13 = src1->ne[3];
|
6269
|
+
const int64_t nrows1 = ggml_nrows(src1);
|
5813
6270
|
|
5814
6271
|
GGML_ASSERT(ne03 == ne13);
|
5815
6272
|
|
5816
6273
|
const int64_t ne0 = dst->ne[0];
|
5817
6274
|
const int64_t ne1 = dst->ne[1];
|
5818
6275
|
|
5819
|
-
const int nb2
|
5820
|
-
const int nb3
|
6276
|
+
const int nb2 = dst->nb[2];
|
6277
|
+
const int nb3 = dst->nb[3];
|
5821
6278
|
|
5822
6279
|
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
|
5823
|
-
GGML_ASSERT(
|
6280
|
+
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
|
5824
6281
|
|
5825
|
-
|
5826
|
-
const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
|
5827
|
-
const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
|
5828
|
-
const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
|
5829
|
-
const int64_t src0_stride = ne00 * ne01 * stride_mod;
|
5830
|
-
const int64_t src1_stride = ne10 * ne11 * stride_mod;
|
5831
|
-
const int64_t dst_stride = ne0 * ne1 * stride_mod;
|
6282
|
+
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
|
5832
6283
|
|
5833
|
-
const int64_t
|
5834
|
-
const int64_t i03_max = flatten_rows ? 1 : ne03;
|
5835
|
-
const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
|
5836
|
-
const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
|
5837
|
-
GGML_ASSERT(!(flatten_rows && ne02 < ne12));
|
6284
|
+
const int64_t i02_divisor = ne12 / ne02;
|
5838
6285
|
|
5839
6286
|
const size_t src0_ts = ggml_type_size(src0->type);
|
5840
6287
|
const size_t src0_bs = ggml_blck_size(src0->type);
|
6288
|
+
const size_t q8_1_ts = sizeof(block_q8_1);
|
6289
|
+
const size_t q8_1_bs = QK8_1;
|
5841
6290
|
|
5842
|
-
struct ggml_tensor_extra_gpu * src0_extra =
|
5843
|
-
struct ggml_tensor_extra_gpu * src1_extra =
|
5844
|
-
struct ggml_tensor_extra_gpu *
|
6291
|
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6292
|
+
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
6293
|
+
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
5845
6294
|
|
5846
6295
|
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
|
5847
6296
|
const bool src0_is_contiguous = ggml_is_contiguous(src0);
|
5848
|
-
const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
|
5849
6297
|
|
5850
|
-
const bool src1_is_contiguous =
|
5851
|
-
const
|
5852
|
-
|
6298
|
+
const bool src1_is_contiguous = ggml_is_contiguous(src1);
|
6299
|
+
const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ?
|
6300
|
+
ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
|
5853
6301
|
|
5854
6302
|
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
6303
|
+
GGML_ASSERT(!(split && ne02 > 1));
|
6304
|
+
GGML_ASSERT(!(split && ne03 > 1));
|
5855
6305
|
GGML_ASSERT(!(split && ne02 < ne12));
|
5856
6306
|
|
5857
|
-
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
5858
|
-
|
5859
6307
|
// dd = data device
|
5860
|
-
char *
|
5861
|
-
float *
|
5862
|
-
|
5863
|
-
float *
|
5864
|
-
|
5865
|
-
//
|
5866
|
-
size_t
|
5867
|
-
size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
6308
|
+
char * src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
6309
|
+
float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
|
6310
|
+
char * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
|
6311
|
+
float * dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
6312
|
+
|
6313
|
+
// as = actual size
|
6314
|
+
size_t src0_as[GGML_CUDA_MAX_DEVICES] = {0};
|
5868
6315
|
size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
5869
|
-
size_t
|
6316
|
+
size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
|
6317
|
+
size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0};
|
5870
6318
|
|
5871
|
-
|
5872
|
-
|
5873
|
-
if (split && g_device_count > 1) {
|
5874
|
-
CUDA_CHECK(cudaSetDevice(g_main_device));
|
5875
|
-
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
|
5876
|
-
}
|
6319
|
+
int64_t row_low[GGML_CUDA_MAX_DEVICES];
|
6320
|
+
int64_t row_high[GGML_CUDA_MAX_DEVICES];
|
5877
6321
|
|
5878
|
-
for (
|
5879
|
-
|
5880
|
-
|
5881
|
-
|
5882
|
-
|
5883
|
-
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
5884
|
-
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
6322
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6323
|
+
// by default, use all rows
|
6324
|
+
row_low[id] = 0;
|
6325
|
+
row_high[id] = ne01;
|
5885
6326
|
|
5886
|
-
|
6327
|
+
// for multi GPU, get the row boundaries from tensor split
|
6328
|
+
// and round to mul_mat_q tile sizes
|
5887
6329
|
if (split) {
|
5888
6330
|
const int64_t rounding = get_row_rounding(src0->type);
|
5889
6331
|
|
5890
|
-
|
5891
|
-
|
6332
|
+
if (id != 0) {
|
6333
|
+
row_low[id] = ne01*g_tensor_split[id];
|
6334
|
+
row_low[id] -= row_low[id] % rounding;
|
6335
|
+
}
|
5892
6336
|
|
5893
|
-
if (id
|
5894
|
-
row_high
|
5895
|
-
|
5896
|
-
row_high = nrows0*g_tensor_split[id + 1];
|
5897
|
-
row_high -= row_high % rounding;
|
6337
|
+
if (id != g_device_count - 1) {
|
6338
|
+
row_high[id] = ne01*g_tensor_split[id + 1];
|
6339
|
+
row_high[id] -= row_high[id] % rounding;
|
5898
6340
|
}
|
5899
|
-
} else {
|
5900
|
-
row_low = 0;
|
5901
|
-
row_high = nrows0*i02_divisor;
|
5902
6341
|
}
|
5903
|
-
|
6342
|
+
}
|
6343
|
+
|
6344
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6345
|
+
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
5904
6346
|
continue;
|
5905
6347
|
}
|
5906
6348
|
|
5907
|
-
|
5908
|
-
|
5909
|
-
cudaSetDevice(id);
|
5910
|
-
cudaStream_t cudaStream_main = g_cudaStreams_main[id];
|
6349
|
+
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
6350
|
+
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
5911
6351
|
|
5912
|
-
|
5913
|
-
|
5914
|
-
CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
|
5915
|
-
}
|
6352
|
+
ggml_cuda_set_device(id);
|
6353
|
+
const cudaStream_t stream = g_cudaStreams[id][0];
|
5916
6354
|
|
5917
6355
|
if (src0_on_device && src0_is_contiguous) {
|
5918
|
-
|
5919
|
-
src0_ddf[id] = (float *) src0_extra->data_device[id];
|
5920
|
-
} else {
|
5921
|
-
src0_ddq[id] = (char *) src0_extra->data_device[id];
|
5922
|
-
}
|
6356
|
+
src0_dd[id] = (char *) src0_extra->data_device[id];
|
5923
6357
|
} else {
|
5924
|
-
|
5925
|
-
|
5926
|
-
} else {
|
5927
|
-
src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
|
5928
|
-
}
|
6358
|
+
const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
|
6359
|
+
src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
|
5929
6360
|
}
|
5930
6361
|
|
5931
|
-
if (
|
5932
|
-
|
6362
|
+
if (src1_on_device && src1_is_contiguous) {
|
6363
|
+
src1_ddf[id] = (float *) src1_extra->data_device[id];
|
6364
|
+
} else {
|
6365
|
+
src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
|
5933
6366
|
}
|
5934
6367
|
|
5935
|
-
if (
|
5936
|
-
|
5937
|
-
|
5938
|
-
|
5939
|
-
src1_ddf[id]
|
6368
|
+
if (convert_src1_to_q8_1) {
|
6369
|
+
src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
|
6370
|
+
|
6371
|
+
if (split && src1_on_device && src1_is_contiguous) {
|
6372
|
+
quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
|
6373
|
+
CUDA_CHECK(cudaGetLastError());
|
5940
6374
|
}
|
5941
6375
|
}
|
6376
|
+
|
5942
6377
|
if (dst_on_device) {
|
5943
|
-
|
6378
|
+
dst_dd[id] = (float *) dst_extra->data_device[id];
|
5944
6379
|
} else {
|
5945
|
-
size_t size_dst_ddf = split ?
|
5946
|
-
|
6380
|
+
const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
|
6381
|
+
dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
|
5947
6382
|
}
|
6383
|
+
}
|
5948
6384
|
|
5949
|
-
|
5950
|
-
|
5951
|
-
|
5952
|
-
|
6385
|
+
// if multiple devices are used they need to wait for the main device
|
6386
|
+
// here an event is recorded that signals that the main device has finished calculating the input data
|
6387
|
+
if (split && g_device_count > 1) {
|
6388
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6389
|
+
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
|
6390
|
+
}
|
5953
6391
|
|
5954
|
-
|
6392
|
+
const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
|
6393
|
+
for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
|
6394
|
+
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
|
6395
|
+
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
|
5955
6396
|
|
5956
|
-
|
5957
|
-
|
5958
|
-
|
6397
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6398
|
+
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
6399
|
+
continue;
|
6400
|
+
}
|
5959
6401
|
|
5960
|
-
|
5961
|
-
|
5962
|
-
|
5963
|
-
if (i0 < i0_offset_low || i0 > i0_offset_high) {
|
5964
|
-
continue;
|
5965
|
-
}
|
5966
|
-
if (i0 == i0_offset_low) {
|
5967
|
-
i01_low = row_low % rows_per_iter;
|
5968
|
-
}
|
5969
|
-
if (i0 == i0_offset_high) {
|
5970
|
-
i01_high = row_high % rows_per_iter;
|
5971
|
-
}
|
5972
|
-
}
|
6402
|
+
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
6403
|
+
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
6404
|
+
const int64_t row_diff = row_high[id] - row_low[id];
|
5973
6405
|
|
5974
|
-
|
5975
|
-
|
5976
|
-
// Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
|
5977
|
-
// The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
|
5978
|
-
GGML_ASSERT(i01_low == 0 || g_device_count > 1);
|
5979
|
-
GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
|
6406
|
+
ggml_cuda_set_device(id);
|
6407
|
+
const cudaStream_t stream = g_cudaStreams[id][is];
|
5980
6408
|
|
5981
|
-
|
5982
|
-
|
5983
|
-
|
5984
|
-
|
5985
|
-
|
6409
|
+
// wait for main GPU data if necessary
|
6410
|
+
if (split && (id != g_main_device || is != 0)) {
|
6411
|
+
CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
|
6412
|
+
}
|
6413
|
+
|
6414
|
+
for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
|
6415
|
+
const int64_t i03 = i0 / ne12;
|
6416
|
+
const int64_t i02 = i0 % ne12;
|
6417
|
+
|
6418
|
+
const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
|
5986
6419
|
|
5987
6420
|
// for split tensors the data begins at i0 == i0_offset_low
|
5988
|
-
char *
|
5989
|
-
float *
|
5990
|
-
|
5991
|
-
float *
|
5992
|
-
|
5993
|
-
// for split tensors the data pointer needs to be rounded down
|
5994
|
-
// to the bin edge for i03, i02 bins beyond the first
|
5995
|
-
if (i0 - i0_offset_low > 0) {
|
5996
|
-
GGML_ASSERT(!flatten_rows);
|
5997
|
-
src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
|
5998
|
-
src0_ddf_i -= (row_low % ne01)*ne00;
|
5999
|
-
dst_ddf_i -= (row_low % ne0)*ne1;
|
6000
|
-
}
|
6421
|
+
char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
|
6422
|
+
float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
|
6423
|
+
char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
|
6424
|
+
float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
|
6001
6425
|
|
6002
6426
|
// the main device memory buffer can be on VRAM scratch, with space for all partial results
|
6003
6427
|
// in that case an offset on dst_ddf_i is needed
|
6004
6428
|
if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
|
6005
|
-
|
6429
|
+
dst_dd_i += row_low[id]; // offset is 0 if no tensor split
|
6006
6430
|
}
|
6007
6431
|
|
6008
6432
|
// copy src0, src1 to device if necessary
|
6009
|
-
if (
|
6010
|
-
if (
|
6011
|
-
|
6012
|
-
|
6013
|
-
|
6014
|
-
|
6015
|
-
|
6016
|
-
GGML_ASSERT(!flatten_rows);
|
6433
|
+
if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
|
6434
|
+
if (id != g_main_device) {
|
6435
|
+
if (convert_src1_to_q8_1) {
|
6436
|
+
char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
|
6437
|
+
CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs,
|
6438
|
+
cudaMemcpyDeviceToDevice, stream));
|
6439
|
+
} else {
|
6017
6440
|
float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
|
6018
|
-
src1_ddf_i_source +=
|
6019
|
-
CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source,
|
6020
|
-
cudaMemcpyDeviceToDevice,
|
6441
|
+
src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
|
6442
|
+
CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float),
|
6443
|
+
cudaMemcpyDeviceToDevice, stream));
|
6021
6444
|
}
|
6022
|
-
} else if (src1_on_device && !src1_is_contiguous) {
|
6023
|
-
GGML_ASSERT(!split);
|
6024
|
-
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
|
6025
|
-
} else {
|
6026
|
-
GGML_ASSERT(false);
|
6027
6445
|
}
|
6446
|
+
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
|
6447
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
|
6448
|
+
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
|
6449
|
+
} else {
|
6450
|
+
GGML_ASSERT(false);
|
6028
6451
|
}
|
6029
6452
|
|
6030
|
-
if (
|
6031
|
-
|
6032
|
-
|
6033
|
-
} else {
|
6034
|
-
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
|
6035
|
-
}
|
6453
|
+
if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
|
6454
|
+
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
|
6455
|
+
CUDA_CHECK(cudaGetLastError());
|
6036
6456
|
}
|
6037
6457
|
|
6038
|
-
|
6039
|
-
|
6040
|
-
to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
|
6041
|
-
CUDA_CHECK(cudaGetLastError());
|
6458
|
+
if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
|
6459
|
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
|
6042
6460
|
}
|
6043
6461
|
|
6044
6462
|
// do the computation
|
6045
|
-
op(src0, src1, dst,
|
6463
|
+
op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
|
6464
|
+
row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
|
6046
6465
|
CUDA_CHECK(cudaGetLastError());
|
6047
6466
|
|
6048
6467
|
// copy dst to host or other device if necessary
|
@@ -6064,95 +6483,86 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
6064
6483
|
// The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
|
6065
6484
|
// Instead they need to be copied to the correct slice in ne0 = dst row index.
|
6066
6485
|
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
|
6067
|
-
float * dhf_dst_i = (float *) ((char *) dst_off_device +
|
6068
|
-
|
6069
|
-
|
6486
|
+
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
6487
|
+
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
6488
|
+
dhf_dst_i += src1_col_0*ne0 + row_low[id];
|
6489
|
+
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float),
|
6490
|
+
row_diff*sizeof(float), src1_ncols, kind, stream));
|
6070
6491
|
} else {
|
6071
6492
|
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
|
6072
|
-
|
6493
|
+
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
|
6494
|
+
dhf_dst_i += src1_col_0*ne0;
|
6495
|
+
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
|
6073
6496
|
}
|
6074
6497
|
}
|
6075
6498
|
|
6076
|
-
//
|
6077
|
-
if (split &&
|
6078
|
-
CUDA_CHECK(cudaEventRecord(src0_extra->events[id],
|
6499
|
+
// add event for the main device to wait on until other device is done
|
6500
|
+
if (split && (id != g_main_device || is != 0)) {
|
6501
|
+
CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
|
6079
6502
|
}
|
6080
6503
|
}
|
6081
6504
|
}
|
6082
6505
|
}
|
6083
6506
|
|
6084
|
-
|
6085
|
-
|
6086
|
-
if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
|
6087
|
-
continue;
|
6088
|
-
}
|
6089
|
-
|
6090
|
-
CUDA_CHECK(cudaSetDevice(id));
|
6507
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6508
|
+
CUDA_CHECK(ggml_cuda_set_device(id));
|
6091
6509
|
|
6092
|
-
|
6093
|
-
|
6094
|
-
|
6095
|
-
if (src0_asf[id] > 0) {
|
6096
|
-
ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
|
6510
|
+
// free buffers again when done
|
6511
|
+
if (src0_as[id] > 0) {
|
6512
|
+
ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
|
6097
6513
|
}
|
6098
6514
|
if (src1_asf[id] > 0) {
|
6099
6515
|
ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
|
6100
6516
|
}
|
6101
|
-
if (
|
6102
|
-
ggml_cuda_pool_free(
|
6517
|
+
if (src1_asq[id] > 0) {
|
6518
|
+
ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
|
6519
|
+
}
|
6520
|
+
if (dst_as[id] > 0) {
|
6521
|
+
ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
|
6103
6522
|
}
|
6104
6523
|
}
|
6105
6524
|
|
6106
6525
|
// main device waits for all other devices to be finished
|
6107
6526
|
if (split && g_device_count > 1) {
|
6108
|
-
|
6109
|
-
|
6110
|
-
|
6111
|
-
|
6527
|
+
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
|
6528
|
+
is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
|
6529
|
+
|
6530
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6531
|
+
for (int64_t id = 0; id < g_device_count; ++id) {
|
6532
|
+
for (int64_t is = 0; is < is_max; ++is) {
|
6533
|
+
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
|
6112
6534
|
}
|
6113
6535
|
}
|
6114
6536
|
}
|
6115
6537
|
|
6116
6538
|
if (dst->backend == GGML_BACKEND_CPU) {
|
6117
|
-
CUDA_CHECK(
|
6539
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6118
6540
|
CUDA_CHECK(cudaDeviceSynchronize());
|
6119
6541
|
}
|
6120
6542
|
}
|
6121
6543
|
|
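The split path in `ggml_cuda_op_mul_mat` above coordinates devices with CUDA events: the main device records an event once the input slices are ready, every other stream waits on it before computing, and the main stream later waits on each stream's completion event. A condensed sketch of that handshake with made-up names (two streams only, error checking omitted; not the backend's code):

```cpp
#include <cuda_runtime.h>

void split_handshake_sketch(cudaStream_t main_stream, cudaStream_t other_stream,
                            cudaEvent_t input_ready, cudaEvent_t other_done) {
    // 1. main device signals that the input data (src0/src1 slices) is ready
    cudaEventRecord(input_ready, main_stream);

    // 2. the secondary stream waits for that signal before computing its row slice
    cudaStreamWaitEvent(other_stream, input_ready, 0);
    // ... enqueue the partial mul_mat on other_stream here ...

    // 3. the secondary stream records completion, and the main stream waits on it
    //    before the combined result is used or copied back to the host
    cudaEventRecord(other_done, other_stream);
    cudaStreamWaitEvent(main_stream, other_done, 0);
}
```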
6122
6544
|
void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6123
|
-
|
6124
|
-
// Due to flatten_rows == true this does in practice not make a difference however.
|
6125
|
-
// Better solution would be nice but right now that would require disproportionate changes.
|
6126
|
-
GGML_ASSERT(
|
6127
|
-
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
|
6128
|
-
src1->type == GGML_TYPE_F32 &&
|
6129
|
-
(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
|
6130
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
|
6545
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
|
6131
6546
|
}
|
6132
6547
|
|
6133
6548
|
void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6134
|
-
|
6135
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
|
6549
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
|
6136
6550
|
}
|
6137
6551
|
|
6138
6552
|
void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6139
|
-
|
6140
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
|
6553
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
|
6141
6554
|
}
|
6142
6555
|
|
6143
6556
|
void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6144
|
-
|
6145
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
|
6557
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
|
6146
6558
|
}
|
6147
6559
|
|
6148
6560
|
void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6149
|
-
|
6150
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
|
6561
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
|
6151
6562
|
}
|
6152
6563
|
|
6153
6564
|
void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6154
|
-
|
6155
|
-
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
|
6565
|
+
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
|
6156
6566
|
}
|
6157
6567
|
|
6158
6568
|
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
@@ -6186,8 +6596,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
|
|
6186
6596
|
|
6187
6597
|
const int64_t ne12 = src1->ne[2];
|
6188
6598
|
|
6189
|
-
CUDA_CHECK(
|
6190
|
-
cudaStream_t
|
6599
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6600
|
+
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
6191
6601
|
|
6192
6602
|
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6193
6603
|
void * src0_ddq = src0_extra->data_device[g_main_device];
|
@@ -6198,7 +6608,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
|
|
6198
6608
|
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
6199
6609
|
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
6200
6610
|
|
6201
|
-
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12,
|
6611
|
+
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
|
6202
6612
|
}
|
6203
6613
|
|
6204
6614
|
void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
@@ -6217,8 +6627,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
|
|
6217
6627
|
const int64_t nb01 = src0->nb[1];
|
6218
6628
|
const int64_t nb02 = src0->nb[2];
|
6219
6629
|
|
6220
|
-
CUDA_CHECK(
|
6221
|
-
cudaStream_t
|
6630
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
6631
|
+
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
6222
6632
|
|
6223
6633
|
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
6224
6634
|
void * src0_ddq = src0_extra->data_device[g_main_device];
|
@@ -6229,38 +6639,49 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
-    const
-    const
+    const int64_t row_stride_x = nb01 / sizeof(half);
+    const int64_t channel_stride_x = nb02 / sizeof(half);
 
-    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x,
+    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
         src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
 
+    int64_t min_compute_capability = INT_MAX;
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (min_compute_capability > g_compute_capabilities[id]
+                && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            min_compute_capability = g_compute_capabilities[id];
+        }
+    }
+
     if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
     } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
     }else if (src0->type == GGML_TYPE_F32) {
-
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
-        } else {
-            int min_compute_capability = INT_MAX;
-            for (int id = 0; id < g_device_count; ++id) {
-                if (min_compute_capability > g_compute_capabilities[id]
-                        && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-                    min_compute_capability = g_compute_capabilities[id];
-                }
-            }
 
+#ifdef GGML_CUDA_FORCE_DMMV
+            const bool use_mul_mat_vec_q = false;
+#else
+            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+#endif // GGML_CUDA_FORCE_DMMV
+
+            if (use_mul_mat_vec_q) {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+            } else {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+            }
+        } else {
             if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
-
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
             } else {
-
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
            }
         }
     } else {
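The hunk above hoists the "weakest participating device" scan to the top of ggml_cuda_mul_mat and uses it twice: to choose between the quantized dot-product mat-vec kernels (which rely on the __dp4a byte dot-product instruction, hence the MIN_CC_DP4A gate, unless GGML_CUDA_FORCE_DMMV is defined) and the dequantize-then-multiply fallback, and for the existing mul_mat_q path. The standalone sketch below reproduces only that decision with plain CUDA runtime queries; tensor_split, FORCE_DMMV and the 16-device cap are stand-ins for the ggml globals, not the real code.

#include <cuda_runtime.h>
#include <limits.h>
#include <stdio.h>

#define MIN_CC_DP4A 610  // __dp4a exists from compute capability 6.1 on

int main(void) {
    int device_count = 0;
    if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count == 0) {
        return 1;
    }
    if (device_count > 16) {
        device_count = 16;
    }

    // cumulative split boundaries: device i owns the rows in [split[i], split[i+1]);
    // all zeros here means "everything on device 0"
    float tensor_split[16] = {0.0f};
    int   compute_cap[16]  = {0};

    for (int id = 0; id < device_count; ++id) {
        int major = 0;
        int minor = 0;
        cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id);
        cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id);
        compute_cap[id] = 100 * major + 10 * minor;  // 6.1 -> 610, 8.6 -> 860, ...
    }

    // only devices that receive a non-empty slice of the split matter
    int min_compute_capability = INT_MAX;
    for (int id = 0; id < device_count; ++id) {
        const float next = (id + 1 < device_count) ? tensor_split[id + 1] : 1.0f;
        if (tensor_split[id] < next && compute_cap[id] < min_compute_capability) {
            min_compute_capability = compute_cap[id];
        }
    }

#ifdef FORCE_DMMV  // stand-in for GGML_CUDA_FORCE_DMMV
    const bool use_mul_mat_vec_q = false;
#else
    const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A;
#endif

    printf("weakest participating device: cc %d -> %s\n", min_compute_capability,
           use_mul_mat_vec_q ? "quantized mat-vec kernels (__dp4a)"
                             : "dequantize + float mat-vec kernels");
    return 0;
}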
@@ -6269,8 +6690,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
 }
 
 void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }
 
 void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6299,8 +6719,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     const int64_t nb11 = src1->nb[1];
     const int64_t nb12 = src1->nb[2];
 
-    CUDA_CHECK(
-    cudaStream_t
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -6310,10 +6730,10 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 
     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12,
+                              ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
-                              ne10, ne11, nb10, nb11, nb12,
+                              ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         GGML_ASSERT(false);
     }
@@ -6327,28 +6747,20 @@ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 }
 
 void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
 }
 
 void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
 }
 
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
-
-    const int mode = ((int32_t *) dst->op_params)[2];
-    const bool is_glm = mode & 4;
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
 }
 
 void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }
 
 void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
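The hunk above (like the scale hunk before it) retires the old ggml_cuda_op(..., true, true) entry point for these single-tensor operators in favour of ggml_cuda_op_flatten, which is also why the GLM special case disappears from the ggml_cuda_rope caller. The sketch below only illustrates the general wrapper pattern, not ggml's actual signatures: each operator becomes a callback over a flat device buffer, and one helper owns the set-device/pick-stream/launch boilerplate. op_func_t, run_flattened and op_scale are names made up for the example.

#include <cuda_runtime.h>
#include <stdio.h>

// a per-operator callback: it only sees flat device buffers and the stream to use
typedef void (*op_func_t)(const float * src, float * dst, int64_t n, cudaStream_t stream);

__global__ void scale_kernel(const float * src, float * dst, int64_t n, float s) {
    const int64_t i = blockIdx.x * (int64_t) blockDim.x + threadIdx.x;
    if (i < n) {
        dst[i] = src[i] * s;
    }
}

static void op_scale(const float * src, float * dst, int64_t n, cudaStream_t stream) {
    scale_kernel<<<(int) ((n + 255) / 256), 256, 0, stream>>>(src, dst, n, 0.5f);
}

// the shared wrapper: treat the tensor as one contiguous 1-D buffer, bind the
// main device and its primary stream, then hand everything to the callback
static void run_flattened(const float * src, float * dst, int64_t n, op_func_t op,
                          int main_device, cudaStream_t main_stream) {
    cudaSetDevice(main_device);
    op(src, dst, n, main_stream);
}

int main(void) {
    const int64_t n = 1 << 16;
    float * d_src = nullptr;
    float * d_dst = nullptr;
    cudaStream_t stream;
    cudaMalloc((void **) &d_src, n * sizeof(float));
    cudaMalloc((void **) &d_dst, n * sizeof(float));
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    run_flattened(d_src, d_dst, n, op_scale, 0, stream);
    cudaStreamSynchronize(stream);
    printf("last error: %s\n", cudaGetErrorString(cudaGetLastError()));

    cudaStreamDestroy(stream);
    cudaFree(d_src);
    cudaFree(d_dst);
    return 0;
}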
@@ -6358,7 +6770,7 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 }
 
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
-
+    const int64_t nrows = ggml_nrows(tensor);
 
     const int64_t ne0 = tensor->ne[0];
 
@@ -6368,14 +6780,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));
 
-    for (
+    for (int64_t id = 0; id < g_device_count; ++id) {
         if (backend == GGML_BACKEND_GPU && id != g_main_device) {
             continue;
         }
 
-
+        ggml_cuda_set_device(id);
 
-
+        int64_t row_low, row_high;
         if (backend == GGML_BACKEND_GPU) {
             row_low = 0;
             row_high = nrows;
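The loop above visits every device, skips the ones not involved for a plain GGML_BACKEND_GPU tensor, and sets up a per-device [row_low, row_high) range. The split-tensor branch that derives those bounds from the cumulative g_tensor_split fractions falls between this hunk and the next, so the sketch below is only an assumed illustration of that idea, not the elided code: it carves nrows rows into contiguous slices from a hypothetical table of cumulative fractions.

#include <stdint.h>
#include <stdio.h>

// Hypothetical helper: given cumulative split boundaries (fractions of the row
// count), return the contiguous row range owned by device `id`.
static void split_rows(int64_t nrows, const float * split, int device_count, int id,
                       int64_t * row_low, int64_t * row_high) {
    *row_low  = (int64_t) (nrows * split[id]);
    *row_high = (id + 1 < device_count) ? (int64_t) (nrows * split[id + 1]) : nrows;
}

int main(void) {
    const int64_t nrows = 4096;
    const float   split[3] = {0.0f, 0.5f, 0.75f};  // device 0: 50 %, device 1: 25 %, device 2: 25 %

    for (int id = 0; id < 3; ++id) {
        int64_t row_low = 0;
        int64_t row_high = 0;
        split_rows(nrows, split, 3, id, &row_low, &row_high);
        printf("device %d owns rows [%lld, %lld)\n", id, (long long) row_low, (long long) row_high);
    }
    return 0;
}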
@@ -6425,7 +6837,9 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         extra->data_device[id] = buf;
 
         if (backend == GGML_BACKEND_GPU_SPLIT) {
-
+            for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+                CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
+            }
         }
     }
 
@@ -6439,15 +6853,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
-    for (
+    for (int64_t id = 0; id < g_device_count; ++id) {
         if (extra->data_device[id] != nullptr) {
-            CUDA_CHECK(
+            CUDA_CHECK(ggml_cuda_set_device(id));
             CUDA_CHECK(cudaFree(extra->data_device[id]));
         }
 
-
-
-
+        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+            if (extra->events[id][is] != nullptr) {
+                CUDA_CHECK(ggml_cuda_set_device(id));
+                CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
+            }
        }
    }
 
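The two hunks above bracket the lifetime of the per-device, per-stream events a split tensor now carries: they are created with cudaEventDisableTiming when the tensor is uploaded and destroyed together with the device buffers. The sketch below shows that lifecycle in isolation; it is a generic illustration of how timing-free events order work across streams, not the ggml-cuda scheduler itself.

#include <cuda_runtime.h>
#include <stdio.h>

#define MAX_STREAMS 8

int main(void) {
    cudaStream_t streams[MAX_STREAMS];
    cudaEvent_t  events[MAX_STREAMS];

    for (int is = 0; is < MAX_STREAMS; ++is) {
        cudaStreamCreateWithFlags(&streams[is], cudaStreamNonBlocking);
        // cudaEventDisableTiming skips timestamp bookkeeping, making
        // record/wait cheaper; these events only exist to order work
        cudaEventCreateWithFlags(&events[is], cudaEventDisableTiming);
    }

    // typical use: whatever is queued on streams[0] (say, a partial result
    // copy) must complete before streams[1] starts consuming it
    cudaEventRecord(events[0], streams[0]);
    cudaStreamWaitEvent(streams[1], events[0], 0);

    // teardown mirrors the free path above: destroy events with the streams
    cudaDeviceSynchronize();
    for (int is = 0; is < MAX_STREAMS; ++is) {
        cudaEventDestroy(events[is]);
        cudaStreamDestroy(streams[is]);
    }
    printf("last error: %s\n", cudaGetErrorString(cudaGetLastError()));
    return 0;
}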
@@ -6499,7 +6915,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
         force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
-    CUDA_CHECK(
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];