llama_cpp 0.5.1 → 0.5.3

@@ -13,7 +13,7 @@
13
13
  #ifdef __HIP_PLATFORM_AMD__
14
14
  // for rocblas_initialize()
15
15
  #include "rocblas/rocblas.h"
16
- #endif
16
+ #endif // __HIP_PLATFORM_AMD__
17
17
  #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
18
18
  #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
19
19
  #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
@@ -31,6 +31,9 @@
31
31
  #define cublasSetStream hipblasSetStream
32
32
  #define cublasSgemm hipblasSgemm
33
33
  #define cublasStatus_t hipblasStatus_t
34
+ #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
35
+ #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
36
+ #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
34
37
  #define cudaDeviceProp hipDeviceProp_t
35
38
  #define cudaDeviceSynchronize hipDeviceSynchronize
36
39
  #define cudaError_t hipError_t
@@ -61,26 +64,36 @@
61
64
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
62
65
  #define cudaStreamNonBlocking hipStreamNonBlocking
63
66
  #define cudaStreamSynchronize hipStreamSynchronize
64
- #define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
67
+ #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
65
68
  #define cudaStream_t hipStream_t
66
69
  #define cudaSuccess hipSuccess
67
70
  #else
68
71
  #include <cuda_runtime.h>
69
72
  #include <cublas_v2.h>
70
73
  #include <cuda_fp16.h>
71
- #endif
74
+ #endif // defined(GGML_USE_HIPBLAS)
72
75
 
73
76
  #include "ggml-cuda.h"
74
77
  #include "ggml.h"
75
78
 
76
- #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
77
- #ifndef CC_TURING
78
- #define CC_TURING 700
79
- #endif
79
+ #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
80
+ #define CC_TURING 700
81
+ #define CC_OFFSET_AMD 1000000
82
+ #define CC_RDNA2 CC_OFFSET_AMD + 1030
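The two new constants implement a simple vendor offset: under HIP the reported compute capability has CC_OFFSET_AMD added to it, so every AMD device sorts above every NVIDIA device and one integer comparison chain can select kernel parameters. A minimal standalone sketch of that dispatch logic follows; the classify() helper and the capability values are illustrative, not part of the patch.

    #include <cstdio>

    #define CC_TURING     700
    #define CC_OFFSET_AMD 1000000
    #define CC_RDNA2      (CC_OFFSET_AMD + 1030)

    // hypothetical helper mirroring the if/else-if ordering used later in the patch
    static const char * classify(const int cc) {
        if (cc >= CC_RDNA2)      return "AMD, RDNA2 or newer";
        if (cc >= CC_OFFSET_AMD) return "AMD, pre-RDNA2";
        if (cc >= CC_TURING)     return "NVIDIA, Turing or newer";
        return "NVIDIA, pre-Turing";
    }

    int main() {
        const int ccs[] = { 610, 750, CC_OFFSET_AMD + 900, CC_OFFSET_AMD + 1030 }; // illustrative values
        for (const int cc : ccs) {
            std::printf("%7d -> %s\n", cc, classify(cc));
        }
        return 0;
    }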
80
83
 
81
84
  #if defined(GGML_USE_HIPBLAS)
82
85
  #define __CUDA_ARCH__ 1300
83
86
 
87
+ #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
88
+ defined(__gfx1150__) || defined(__gfx1151__)
89
+ #define RDNA3
90
+ #endif
91
+
92
+ #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
93
+ defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
94
+ #define RDNA2
95
+ #endif
96
+
84
97
  #ifndef __has_builtin
85
98
  #define __has_builtin(x) 0
86
99
  #endif
@@ -132,7 +145,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
132
145
  #endif
133
146
  return c;
134
147
  }
135
- #endif
148
+ #endif // defined(GGML_USE_HIPBLAS)
136
149
 
137
150
  #if defined(_MSC_VER)
138
151
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -144,8 +157,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
144
157
  do { \
145
158
  cudaError_t err_ = (err); \
146
159
  if (err_ != cudaSuccess) { \
147
- fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
160
+ int id; \
161
+ cudaGetDevice(&id); \
162
+ fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
148
163
  cudaGetErrorString(err_)); \
164
+ fprintf(stderr, "current device: %d\n", id); \
149
165
  exit(1); \
150
166
  } \
151
167
  } while (0)
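With the added cudaGetDevice() call, a failing runtime call now also reports which device was current, which matters once several GPUs are in flight. A hedged usage sketch; the wrapper function and sizes are illustrative, not from the patch.

    // illustrative only: any CUDA runtime call can be wrapped in CUDA_CHECK so a
    // failure prints file, line, the error string, and the current device id
    static void example_alloc_and_free(const int device, const size_t nbytes) {
        CUDA_CHECK(cudaSetDevice(device));
        void * ptr = nullptr;
        CUDA_CHECK(cudaMalloc(&ptr, nbytes));
        CUDA_CHECK(cudaFree(ptr));
    }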
@@ -155,8 +171,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
155
171
  do { \
156
172
  cublasStatus_t err_ = (err); \
157
173
  if (err_ != CUBLAS_STATUS_SUCCESS) { \
174
+ int id; \
175
+ cudaGetDevice(&id); \
158
176
  fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
159
177
  err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
178
+ fprintf(stderr, "current device: %d\n", id); \
160
179
  exit(1); \
161
180
  } \
162
181
  } while (0)
@@ -165,12 +184,21 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
165
184
  do { \
166
185
  cublasStatus_t err_ = (err); \
167
186
  if (err_ != CUBLAS_STATUS_SUCCESS) { \
187
+ int id; \
188
+ cudaGetDevice(&id); \
168
189
  fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
190
+ fprintf(stderr, "current device: %d\n", id); \
169
191
  exit(1); \
170
192
  } \
171
193
  } while (0)
172
194
  #endif // CUDART_VERSION >= 11
173
195
 
196
+ #if CUDART_VERSION >= 11100
197
+ #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
198
+ #else
199
+ #define GGML_CUDA_ASSUME(x)
200
+ #endif // CUDART_VERSION >= 11100
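GGML_CUDA_ASSUME wraps __builtin_assume, which older CUDA toolkits do not accept in device code, so below CUDART 11.1 it expands to nothing; the mass replacement of raw __builtin_assume calls further down follows from that. A minimal sketch, reusing the macro and the file's WARP_SIZE, of the kind of range hint the tile loaders pass to the compiler (example_load_lane is hypothetical):

    // sketch only: the assumptions tell the compiler that k is a small
    // non-negative index, allowing it to simplify the addressing
    static __device__ __forceinline__ float example_load_lane(const float * tile, const int k) {
        GGML_CUDA_ASSUME(k >= 0);
        GGML_CUDA_ASSUME(k < WARP_SIZE);
        return tile[k];
    }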
201
+
174
202
  #ifdef GGML_CUDA_F16
175
203
  typedef half dfloat; // dequantize float
176
204
  typedef half2 dfloat2;
@@ -212,10 +240,13 @@ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__
212
240
  typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
213
241
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
214
242
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
215
- typedef void (*ggml_cuda_op_t)(
216
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
217
- float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
218
- cudaStream_t & cudaStream_main);
243
+ typedef void (*ggml_cuda_op_mul_mat_t)(
244
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
245
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
246
+ const int64_t src1_padded_row_size, const cudaStream_t & stream);
247
+ typedef void (*ggml_cuda_op_flatten_t)(
248
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
249
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream);
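The single ggml_cuda_op_t callback is split in two: a mul-mat variant that also receives quantized src1 data, a row range and padding information, and a flatten variant for element-wise ops that only ever see whole contiguous buffers on one stream. A hedged skeleton of an op matching the flatten signature, modeled on the ggml_cuda_op_gelu/silu implementations later in this diff; the name and the commented kernel call are illustrative.

    inline void ggml_cuda_op_example_unary(
        const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
        const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

        GGML_ASSERT(src0->type == GGML_TYPE_F32);
        GGML_ASSERT( dst->type == GGML_TYPE_F32);

        // a real op launches its kernel over the whole tensor here, e.g.
        // example_unary_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

        (void) src1;
        (void) src1_dd;
    }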
219
250
 
220
251
  // QK = number of values after dequantization
221
252
  // QR = QK / number of values before dequantization
@@ -396,11 +427,33 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
396
427
  static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
397
428
  #endif
398
429
 
430
+ #ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
431
+ #define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
432
+ #endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
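Together with the new cudaDeviceCanAccessPeer/cudaDeviceEnablePeerAccess/cudaDeviceDisablePeerAccess mappings near the top of the file, this limit suggests peer-to-peer access between GPUs is only turned on for small enough batches. The standard runtime pattern, as a hedged sketch; the control flow is illustrative, not copied from the patch.

    static void example_enable_peer_access(const int device_count) {
        for (int id = 0; id < device_count; ++id) {
            CUDA_CHECK(cudaSetDevice(id));
            for (int id_other = 0; id_other < device_count; ++id_other) {
                if (id_other == id) {
                    continue;
                }
                int can_access_peer = 0;
                CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
                if (can_access_peer) {
                    // a repeated call may return cudaErrorPeerAccessAlreadyEnabled,
                    // which a real implementation would likely tolerate
                    cudaDeviceEnablePeerAccess(id_other, 0);
                }
            }
        }
    }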
433
+
434
+ #define MUL_MAT_SRC1_COL_STRIDE 128
435
+
436
+ #define MAX_STREAMS 8
437
+ static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
438
+
399
439
  struct ggml_tensor_extra_gpu {
400
440
  void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
401
- cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
441
+ cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
402
442
  };
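g_cudaStreams replaces the single main stream per device with MAX_STREAMS non-blocking streams, and the per-tensor event array gains a matching stream dimension so work on different streams can be synchronized individually. A hedged sketch of how work might be spread over those streams; the round-robin helper is illustrative, not the patch's actual scheduling.

    // illustrative: pick one of the per-device streams in round-robin order
    static cudaStream_t example_pick_stream(const int device, const int64_t work_item) {
        return g_cudaStreams[device][work_item % MAX_STREAMS];
    }

    // usage sketch: independent chunks of work can then overlap on one device
    // for (int64_t i = 0; i < n_chunks; ++i) {
    //     cudaStream_t stream = example_pick_stream(device, i);
    //     /* enqueue copy + kernel for chunk i on stream */
    // }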
403
443
 
444
+ // this is faster on Windows
445
+ // probably because the Windows CUDA libraries forget to make this check before invoking the drivers
446
+ inline cudaError_t ggml_cuda_set_device(const int device) {
447
+ int current_device;
448
+ CUDA_CHECK(cudaGetDevice(&current_device));
449
+
450
+ if (device == current_device) {
451
+ return cudaSuccess;
452
+ }
453
+
454
+ return cudaSetDevice(device);
455
+ }
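cudaSetDevice is only issued when the device actually changes, which the comment attributes to a measurable cost on Windows. Because the helper returns a cudaError_t it composes with CUDA_CHECK, which is how the initialization code below uses it. A small hedged usage sketch; the loop body is illustrative.

    static void example_sync_all_devices(const int device_count) {
        for (int id = 0; id < device_count; ++id) {
            CUDA_CHECK(ggml_cuda_set_device(id));   // switches only if needed
            CUDA_CHECK(cudaDeviceSynchronize());
        }
        // a second call with the same id costs only a cudaGetDevice()
        CUDA_CHECK(ggml_cuda_set_device(device_count - 1));
    }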
456
+
404
457
  static int g_device_count = -1;
405
458
  static int g_main_device = 0;
406
459
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
@@ -413,8 +466,6 @@ static size_t g_scratch_offset = 0;
413
466
 
414
467
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
415
468
 
416
- static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
417
-
418
469
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
419
470
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
420
471
 
@@ -2107,10 +2158,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2107
2158
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2108
2159
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2109
2160
 
2110
- __builtin_assume(i_offset >= 0);
2111
- __builtin_assume(i_offset < nwarps);
2112
- __builtin_assume(k >= 0);
2113
- __builtin_assume(k < WARP_SIZE);
2161
+ GGML_CUDA_ASSUME(i_offset >= 0);
2162
+ GGML_CUDA_ASSUME(i_offset < nwarps);
2163
+ GGML_CUDA_ASSUME(k >= 0);
2164
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
2114
2165
 
2115
2166
  const int kbx = k / QI4_0;
2116
2167
  const int kqsx = k % QI4_0;
@@ -2201,10 +2252,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2201
2252
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2202
2253
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2203
2254
 
2204
- __builtin_assume(i_offset >= 0);
2205
- __builtin_assume(i_offset < nwarps);
2206
- __builtin_assume(k >= 0);
2207
- __builtin_assume(k < WARP_SIZE);
2255
+ GGML_CUDA_ASSUME(i_offset >= 0);
2256
+ GGML_CUDA_ASSUME(i_offset < nwarps);
2257
+ GGML_CUDA_ASSUME(k >= 0);
2258
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
2208
2259
 
2209
2260
  const int kbx = k / QI4_1;
2210
2261
  const int kqsx = k % QI4_1;
@@ -2293,10 +2344,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2293
2344
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2294
2345
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2295
2346
 
2296
- __builtin_assume(i_offset >= 0);
2297
- __builtin_assume(i_offset < nwarps);
2298
- __builtin_assume(k >= 0);
2299
- __builtin_assume(k < WARP_SIZE);
2347
+ GGML_CUDA_ASSUME(i_offset >= 0);
2348
+ GGML_CUDA_ASSUME(i_offset < nwarps);
2349
+ GGML_CUDA_ASSUME(k >= 0);
2350
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
2300
2351
 
2301
2352
  const int kbx = k / QI5_0;
2302
2353
  const int kqsx = k % QI5_0;
@@ -2407,10 +2458,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2407
2458
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2408
2459
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2409
2460
 
2410
- __builtin_assume(i_offset >= 0);
2411
- __builtin_assume(i_offset < nwarps);
2412
- __builtin_assume(k >= 0);
2413
- __builtin_assume(k < WARP_SIZE);
2461
+ GGML_CUDA_ASSUME(i_offset >= 0);
2462
+ GGML_CUDA_ASSUME(i_offset < nwarps);
2463
+ GGML_CUDA_ASSUME(k >= 0);
2464
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
2414
2465
 
2415
2466
  const int kbx = k / QI5_1;
2416
2467
  const int kqsx = k % QI5_1;
@@ -2513,10 +2564,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2513
2564
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2514
2565
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2515
2566
 
2516
- __builtin_assume(i_offset >= 0);
2517
- __builtin_assume(i_offset < nwarps);
2518
- __builtin_assume(k >= 0);
2519
- __builtin_assume(k < WARP_SIZE);
2567
+ GGML_CUDA_ASSUME(i_offset >= 0);
2568
+ GGML_CUDA_ASSUME(i_offset < nwarps);
2569
+ GGML_CUDA_ASSUME(k >= 0);
2570
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
2520
2571
 
2521
2572
  const int kbx = k / QI8_0;
2522
2573
  const int kqsx = k % QI8_0;
@@ -2604,10 +2655,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2604
2655
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2605
2656
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2606
2657
 
2607
- __builtin_assume(i_offset >= 0);
2608
- __builtin_assume(i_offset < nwarps);
2609
- __builtin_assume(k >= 0);
2610
- __builtin_assume(k < WARP_SIZE);
2658
+ GGML_CUDA_ASSUME(i_offset >= 0);
2659
+ GGML_CUDA_ASSUME(i_offset < nwarps);
2660
+ GGML_CUDA_ASSUME(k >= 0);
2661
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
2611
2662
 
2612
2663
  const int kbx = k / QI2_K;
2613
2664
  const int kqsx = k % QI2_K;
@@ -2725,10 +2776,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2725
2776
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2726
2777
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2727
2778
 
2728
- __builtin_assume(i_offset >= 0);
2729
- __builtin_assume(i_offset < nwarps);
2730
- __builtin_assume(k >= 0);
2731
- __builtin_assume(k < WARP_SIZE);
2779
+ GGML_CUDA_ASSUME(i_offset >= 0);
2780
+ GGML_CUDA_ASSUME(i_offset < nwarps);
2781
+ GGML_CUDA_ASSUME(k >= 0);
2782
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
2732
2783
 
2733
2784
  const int kbx = k / QI3_K;
2734
2785
  const int kqsx = k % QI3_K;
@@ -2943,10 +2994,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2943
2994
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2944
2995
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2945
2996
 
2946
- __builtin_assume(i_offset >= 0);
2947
- __builtin_assume(i_offset < nwarps);
2948
- __builtin_assume(k >= 0);
2949
- __builtin_assume(k < WARP_SIZE);
2997
+ GGML_CUDA_ASSUME(i_offset >= 0);
2998
+ GGML_CUDA_ASSUME(i_offset < nwarps);
2999
+ GGML_CUDA_ASSUME(k >= 0);
3000
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
2950
3001
 
2951
3002
  const int kbx = k / QI4_K; // == 0 if QK_K == 256
2952
3003
  const int kqsx = k % QI4_K; // == k if QK_K == 256
@@ -3124,10 +3175,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
3124
3175
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
3125
3176
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
3126
3177
 
3127
- __builtin_assume(i_offset >= 0);
3128
- __builtin_assume(i_offset < nwarps);
3129
- __builtin_assume(k >= 0);
3130
- __builtin_assume(k < WARP_SIZE);
3178
+ GGML_CUDA_ASSUME(i_offset >= 0);
3179
+ GGML_CUDA_ASSUME(i_offset < nwarps);
3180
+ GGML_CUDA_ASSUME(k >= 0);
3181
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
3131
3182
 
3132
3183
  const int kbx = k / QI5_K; // == 0 if QK_K == 256
3133
3184
  const int kqsx = k % QI5_K; // == k if QK_K == 256
@@ -3253,10 +3304,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
3253
3304
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
3254
3305
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
3255
3306
 
3256
- __builtin_assume(i_offset >= 0);
3257
- __builtin_assume(i_offset < nwarps);
3258
- __builtin_assume(k >= 0);
3259
- __builtin_assume(k < WARP_SIZE);
3307
+ GGML_CUDA_ASSUME(i_offset >= 0);
3308
+ GGML_CUDA_ASSUME(i_offset < nwarps);
3309
+ GGML_CUDA_ASSUME(k >= 0);
3310
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
3260
3311
 
3261
3312
  const int kbx = k / QI6_K; // == 0 if QK_K == 256
3262
3313
  const int kqsx = k % QI6_K; // == k if QK_K == 256
@@ -3444,6 +3495,12 @@ static __device__ __forceinline__ void mul_mat_q(
3444
3495
  }
3445
3496
  }
3446
3497
 
3498
+ #define MMQ_X_Q4_0_RDNA2 64
3499
+ #define MMQ_Y_Q4_0_RDNA2 128
3500
+ #define NWARPS_Q4_0_RDNA2 8
3501
+ #define MMQ_X_Q4_0_RDNA1 64
3502
+ #define MMQ_Y_Q4_0_RDNA1 64
3503
+ #define NWARPS_Q4_0_RDNA1 8
3447
3504
  #define MMQ_X_Q4_0_AMPERE 64
3448
3505
  #define MMQ_Y_Q4_0_AMPERE 128
3449
3506
  #define NWARPS_Q4_0_AMPERE 4
@@ -3451,11 +3508,32 @@ static __device__ __forceinline__ void mul_mat_q(
3451
3508
  #define MMQ_Y_Q4_0_PASCAL 64
3452
3509
  #define NWARPS_Q4_0_PASCAL 8
3453
3510
 
3454
- template <bool need_check> static __global__ void mul_mat_q4_0(
3511
+ template <bool need_check> static __global__ void
3512
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3513
+ #if defined(RDNA3) || defined(RDNA2)
3514
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
3515
+ #endif // defined(RDNA3) || defined(RDNA2)
3516
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3517
+ mul_mat_q4_0(
3455
3518
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3456
3519
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3457
3520
 
3458
- #if __CUDA_ARCH__ >= CC_TURING
3521
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3522
+ #if defined(RDNA3) || defined(RDNA2)
3523
+ const int mmq_x = MMQ_X_Q4_0_RDNA2;
3524
+ const int mmq_y = MMQ_Y_Q4_0_RDNA2;
3525
+ const int nwarps = NWARPS_Q4_0_RDNA2;
3526
+ #else
3527
+ const int mmq_x = MMQ_X_Q4_0_RDNA1;
3528
+ const int mmq_y = MMQ_Y_Q4_0_RDNA1;
3529
+ const int nwarps = NWARPS_Q4_0_RDNA1;
3530
+ #endif // defined(RDNA3) || defined(RDNA2)
3531
+
3532
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3533
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3534
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3535
+
3536
+ #elif __CUDA_ARCH__ >= CC_TURING
3459
3537
  const int mmq_x = MMQ_X_Q4_0_AMPERE;
3460
3538
  const int mmq_y = MMQ_Y_Q4_0_AMPERE;
3461
3539
  const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3478,6 +3556,12 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
3478
3556
  #endif // __CUDA_ARCH__ >= CC_TURING
3479
3557
  }
3480
3558
 
3559
+ #define MMQ_X_Q4_1_RDNA2 64
3560
+ #define MMQ_Y_Q4_1_RDNA2 128
3561
+ #define NWARPS_Q4_1_RDNA2 8
3562
+ #define MMQ_X_Q4_1_RDNA1 64
3563
+ #define MMQ_Y_Q4_1_RDNA1 64
3564
+ #define NWARPS_Q4_1_RDNA1 8
3481
3565
  #define MMQ_X_Q4_1_AMPERE 64
3482
3566
  #define MMQ_Y_Q4_1_AMPERE 128
3483
3567
  #define NWARPS_Q4_1_AMPERE 4
@@ -3486,14 +3570,33 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
3486
3570
  #define NWARPS_Q4_1_PASCAL 8
3487
3571
 
3488
3572
  template <bool need_check> static __global__ void
3489
- #if __CUDA_ARCH__ < CC_TURING
3573
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3574
+ #if defined(RDNA3) || defined(RDNA2)
3575
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
3576
+ #endif // defined(RDNA3) || defined(RDNA2)
3577
+ #elif __CUDA_ARCH__ < CC_TURING
3490
3578
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
3491
3579
  #endif // __CUDA_ARCH__ < CC_TURING
3492
3580
  mul_mat_q4_1(
3493
3581
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3494
3582
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3495
3583
 
3496
- #if __CUDA_ARCH__ >= CC_TURING
3584
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3585
+ #if defined(RDNA3) || defined(RDNA2)
3586
+ const int mmq_x = MMQ_X_Q4_1_RDNA2;
3587
+ const int mmq_y = MMQ_Y_Q4_1_RDNA2;
3588
+ const int nwarps = NWARPS_Q4_1_RDNA2;
3589
+ #else
3590
+ const int mmq_x = MMQ_X_Q4_1_RDNA1;
3591
+ const int mmq_y = MMQ_Y_Q4_1_RDNA1;
3592
+ const int nwarps = NWARPS_Q4_1_RDNA1;
3593
+ #endif // defined(RDNA3) || defined(RDNA2)
3594
+
3595
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
3596
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3597
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3598
+
3599
+ #elif __CUDA_ARCH__ >= CC_TURING
3497
3600
  const int mmq_x = MMQ_X_Q4_1_AMPERE;
3498
3601
  const int mmq_y = MMQ_Y_Q4_1_AMPERE;
3499
3602
  const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3516,6 +3619,12 @@ template <bool need_check> static __global__ void
3516
3619
  #endif // __CUDA_ARCH__ >= CC_TURING
3517
3620
  }
3518
3621
 
3622
+ #define MMQ_X_Q5_0_RDNA2 64
3623
+ #define MMQ_Y_Q5_0_RDNA2 128
3624
+ #define NWARPS_Q5_0_RDNA2 8
3625
+ #define MMQ_X_Q5_0_RDNA1 64
3626
+ #define MMQ_Y_Q5_0_RDNA1 64
3627
+ #define NWARPS_Q5_0_RDNA1 8
3519
3628
  #define MMQ_X_Q5_0_AMPERE 128
3520
3629
  #define MMQ_Y_Q5_0_AMPERE 64
3521
3630
  #define NWARPS_Q5_0_AMPERE 4
@@ -3523,11 +3632,32 @@ template <bool need_check> static __global__ void
3523
3632
  #define MMQ_Y_Q5_0_PASCAL 64
3524
3633
  #define NWARPS_Q5_0_PASCAL 8
3525
3634
 
3526
- template <bool need_check> static __global__ void mul_mat_q5_0(
3635
+ template <bool need_check> static __global__ void
3636
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3637
+ #if defined(RDNA3) || defined(RDNA2)
3638
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
3639
+ #endif // defined(RDNA3) || defined(RDNA2)
3640
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3641
+ mul_mat_q5_0(
3527
3642
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3528
3643
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3529
3644
 
3530
- #if __CUDA_ARCH__ >= CC_TURING
3645
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3646
+ #if defined(RDNA3) || defined(RDNA2)
3647
+ const int mmq_x = MMQ_X_Q5_0_RDNA2;
3648
+ const int mmq_y = MMQ_Y_Q5_0_RDNA2;
3649
+ const int nwarps = NWARPS_Q5_0_RDNA2;
3650
+ #else
3651
+ const int mmq_x = MMQ_X_Q5_0_RDNA1;
3652
+ const int mmq_y = MMQ_Y_Q5_0_RDNA1;
3653
+ const int nwarps = NWARPS_Q5_0_RDNA1;
3654
+ #endif // defined(RDNA3) || defined(RDNA2)
3655
+
3656
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
3657
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3658
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3659
+
3660
+ #elif __CUDA_ARCH__ >= CC_TURING
3531
3661
  const int mmq_x = MMQ_X_Q5_0_AMPERE;
3532
3662
  const int mmq_y = MMQ_Y_Q5_0_AMPERE;
3533
3663
  const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3550,6 +3680,12 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
3550
3680
  #endif // __CUDA_ARCH__ >= CC_TURING
3551
3681
  }
3552
3682
 
3683
+ #define MMQ_X_Q5_1_RDNA2 64
3684
+ #define MMQ_Y_Q5_1_RDNA2 128
3685
+ #define NWARPS_Q5_1_RDNA2 8
3686
+ #define MMQ_X_Q5_1_RDNA1 64
3687
+ #define MMQ_Y_Q5_1_RDNA1 64
3688
+ #define NWARPS_Q5_1_RDNA1 8
3553
3689
  #define MMQ_X_Q5_1_AMPERE 128
3554
3690
  #define MMQ_Y_Q5_1_AMPERE 64
3555
3691
  #define NWARPS_Q5_1_AMPERE 4
@@ -3557,11 +3693,32 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
3557
3693
  #define MMQ_Y_Q5_1_PASCAL 64
3558
3694
  #define NWARPS_Q5_1_PASCAL 8
3559
3695
 
3560
- template <bool need_check> static __global__ void mul_mat_q5_1(
3696
+ template <bool need_check> static __global__ void
3697
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3698
+ #if defined(RDNA3) || defined(RDNA2)
3699
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
3700
+ #endif // defined(RDNA3) || defined(RDNA2)
3701
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3702
+ mul_mat_q5_1(
3561
3703
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3562
3704
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3563
3705
 
3564
- #if __CUDA_ARCH__ >= CC_TURING
3706
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3707
+ #if defined(RDNA3) || defined(RDNA2)
3708
+ const int mmq_x = MMQ_X_Q5_1_RDNA2;
3709
+ const int mmq_y = MMQ_Y_Q5_1_RDNA2;
3710
+ const int nwarps = NWARPS_Q5_1_RDNA2;
3711
+ #else
3712
+ const int mmq_x = MMQ_X_Q5_1_RDNA1;
3713
+ const int mmq_y = MMQ_Y_Q5_1_RDNA1;
3714
+ const int nwarps = NWARPS_Q5_1_RDNA1;
3715
+ #endif // defined(RDNA3) || defined(RDNA2)
3716
+
3717
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
3718
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3719
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3720
+
3721
+ #elif __CUDA_ARCH__ >= CC_TURING
3565
3722
  const int mmq_x = MMQ_X_Q5_1_AMPERE;
3566
3723
  const int mmq_y = MMQ_Y_Q5_1_AMPERE;
3567
3724
  const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3584,6 +3741,12 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
3584
3741
  #endif // __CUDA_ARCH__ >= CC_TURING
3585
3742
  }
3586
3743
 
3744
+ #define MMQ_X_Q8_0_RDNA2 64
3745
+ #define MMQ_Y_Q8_0_RDNA2 128
3746
+ #define NWARPS_Q8_0_RDNA2 8
3747
+ #define MMQ_X_Q8_0_RDNA1 64
3748
+ #define MMQ_Y_Q8_0_RDNA1 64
3749
+ #define NWARPS_Q8_0_RDNA1 8
3587
3750
  #define MMQ_X_Q8_0_AMPERE 128
3588
3751
  #define MMQ_Y_Q8_0_AMPERE 64
3589
3752
  #define NWARPS_Q8_0_AMPERE 4
@@ -3591,11 +3754,32 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
3591
3754
  #define MMQ_Y_Q8_0_PASCAL 64
3592
3755
  #define NWARPS_Q8_0_PASCAL 8
3593
3756
 
3594
- template <bool need_check> static __global__ void mul_mat_q8_0(
3757
+ template <bool need_check> static __global__ void
3758
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3759
+ #if defined(RDNA3) || defined(RDNA2)
3760
+ __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
3761
+ #endif // defined(RDNA3) || defined(RDNA2)
3762
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3763
+ mul_mat_q8_0(
3595
3764
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3596
3765
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3597
3766
 
3598
- #if __CUDA_ARCH__ >= CC_TURING
3767
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3768
+ #if defined(RDNA3) || defined(RDNA2)
3769
+ const int mmq_x = MMQ_X_Q8_0_RDNA2;
3770
+ const int mmq_y = MMQ_Y_Q8_0_RDNA2;
3771
+ const int nwarps = NWARPS_Q8_0_RDNA2;
3772
+ #else
3773
+ const int mmq_x = MMQ_X_Q8_0_RDNA1;
3774
+ const int mmq_y = MMQ_Y_Q8_0_RDNA1;
3775
+ const int nwarps = NWARPS_Q8_0_RDNA1;
3776
+ #endif // defined(RDNA3) || defined(RDNA2)
3777
+
3778
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
3779
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3780
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3781
+
3782
+ #elif __CUDA_ARCH__ >= CC_TURING
3599
3783
  const int mmq_x = MMQ_X_Q8_0_AMPERE;
3600
3784
  const int mmq_y = MMQ_Y_Q8_0_AMPERE;
3601
3785
  const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3618,6 +3802,12 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
3618
3802
  #endif // __CUDA_ARCH__ >= CC_TURING
3619
3803
  }
3620
3804
 
3805
+ #define MMQ_X_Q2_K_RDNA2 64
3806
+ #define MMQ_Y_Q2_K_RDNA2 128
3807
+ #define NWARPS_Q2_K_RDNA2 8
3808
+ #define MMQ_X_Q2_K_RDNA1 128
3809
+ #define MMQ_Y_Q2_K_RDNA1 32
3810
+ #define NWARPS_Q2_K_RDNA1 8
3621
3811
  #define MMQ_X_Q2_K_AMPERE 64
3622
3812
  #define MMQ_Y_Q2_K_AMPERE 128
3623
3813
  #define NWARPS_Q2_K_AMPERE 4
@@ -3625,11 +3815,32 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
3625
3815
  #define MMQ_Y_Q2_K_PASCAL 64
3626
3816
  #define NWARPS_Q2_K_PASCAL 8
3627
3817
 
3628
- template <bool need_check> static __global__ void mul_mat_q2_K(
3818
+ template <bool need_check> static __global__ void
3819
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3820
+ #if defined(RDNA3) || defined(RDNA2)
3821
+ __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
3822
+ #endif // defined(RDNA3) || defined(RDNA2)
3823
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3824
+ mul_mat_q2_K(
3629
3825
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3630
3826
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3631
3827
 
3632
- #if __CUDA_ARCH__ >= CC_TURING
3828
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3829
+ #if defined(RDNA3) || defined(RDNA2)
3830
+ const int mmq_x = MMQ_X_Q2_K_RDNA2;
3831
+ const int mmq_y = MMQ_Y_Q2_K_RDNA2;
3832
+ const int nwarps = NWARPS_Q2_K_RDNA2;
3833
+ #else
3834
+ const int mmq_x = MMQ_X_Q2_K_RDNA1;
3835
+ const int mmq_y = MMQ_Y_Q2_K_RDNA1;
3836
+ const int nwarps = NWARPS_Q2_K_RDNA1;
3837
+ #endif // defined(RDNA3) || defined(RDNA2)
3838
+
3839
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
3840
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3841
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3842
+
3843
+ #elif __CUDA_ARCH__ >= CC_TURING
3633
3844
  const int mmq_x = MMQ_X_Q2_K_AMPERE;
3634
3845
  const int mmq_y = MMQ_Y_Q2_K_AMPERE;
3635
3846
  const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3652,6 +3863,12 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
3652
3863
  #endif // __CUDA_ARCH__ >= CC_TURING
3653
3864
  }
3654
3865
 
3866
+ #define MMQ_X_Q3_K_RDNA2 128
3867
+ #define MMQ_Y_Q3_K_RDNA2 64
3868
+ #define NWARPS_Q3_K_RDNA2 8
3869
+ #define MMQ_X_Q3_K_RDNA1 32
3870
+ #define MMQ_Y_Q3_K_RDNA1 128
3871
+ #define NWARPS_Q3_K_RDNA1 8
3655
3872
  #define MMQ_X_Q3_K_AMPERE 128
3656
3873
  #define MMQ_Y_Q3_K_AMPERE 128
3657
3874
  #define NWARPS_Q3_K_AMPERE 4
@@ -3660,14 +3877,33 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
3660
3877
  #define NWARPS_Q3_K_PASCAL 8
3661
3878
 
3662
3879
  template <bool need_check> static __global__ void
3663
- #if __CUDA_ARCH__ < CC_TURING
3880
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3881
+ #if defined(RDNA3) || defined(RDNA2)
3882
+ __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
3883
+ #endif // defined(RDNA3) || defined(RDNA2)
3884
+ #elif __CUDA_ARCH__ < CC_TURING
3664
3885
  __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
3665
3886
  #endif // __CUDA_ARCH__ < CC_TURING
3666
3887
  mul_mat_q3_K(
3667
3888
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3668
3889
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3669
3890
 
3670
- #if __CUDA_ARCH__ >= CC_TURING
3891
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3892
+ #if defined(RDNA3) || defined(RDNA2)
3893
+ const int mmq_x = MMQ_X_Q3_K_RDNA2;
3894
+ const int mmq_y = MMQ_Y_Q3_K_RDNA2;
3895
+ const int nwarps = NWARPS_Q3_K_RDNA2;
3896
+ #else
3897
+ const int mmq_x = MMQ_X_Q3_K_RDNA1;
3898
+ const int mmq_y = MMQ_Y_Q3_K_RDNA1;
3899
+ const int nwarps = NWARPS_Q3_K_RDNA1;
3900
+ #endif // defined(RDNA3) || defined(RDNA2)
3901
+
3902
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
3903
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3904
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3905
+
3906
+ #elif __CUDA_ARCH__ >= CC_TURING
3671
3907
  const int mmq_x = MMQ_X_Q3_K_AMPERE;
3672
3908
  const int mmq_y = MMQ_Y_Q3_K_AMPERE;
3673
3909
  const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3690,6 +3926,12 @@ template <bool need_check> static __global__ void
3690
3926
  #endif // __CUDA_ARCH__ >= CC_TURING
3691
3927
  }
3692
3928
 
3929
+ #define MMQ_X_Q4_K_RDNA2 64
3930
+ #define MMQ_Y_Q4_K_RDNA2 128
3931
+ #define NWARPS_Q4_K_RDNA2 8
3932
+ #define MMQ_X_Q4_K_RDNA1 32
3933
+ #define MMQ_Y_Q4_K_RDNA1 64
3934
+ #define NWARPS_Q4_K_RDNA1 8
3693
3935
  #define MMQ_X_Q4_K_AMPERE 64
3694
3936
  #define MMQ_Y_Q4_K_AMPERE 128
3695
3937
  #define NWARPS_Q4_K_AMPERE 4
@@ -3698,14 +3940,33 @@ template <bool need_check> static __global__ void
3698
3940
  #define NWARPS_Q4_K_PASCAL 8
3699
3941
 
3700
3942
  template <bool need_check> static __global__ void
3701
- #if __CUDA_ARCH__ < CC_TURING
3943
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3944
+ #if defined(RDNA3) || defined(RDNA2)
3945
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
3946
+ #endif // defined(RDNA3) || defined(RDNA2)
3947
+ #elif __CUDA_ARCH__ < CC_TURING
3702
3948
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
3703
3949
  #endif // __CUDA_ARCH__ < CC_TURING
3704
3950
  mul_mat_q4_K(
3705
3951
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3706
3952
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3707
3953
 
3708
- #if __CUDA_ARCH__ >= CC_TURING
3954
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3955
+ #if defined(RDNA3) || defined(RDNA2)
3956
+ const int mmq_x = MMQ_X_Q4_K_RDNA2;
3957
+ const int mmq_y = MMQ_Y_Q4_K_RDNA2;
3958
+ const int nwarps = NWARPS_Q4_K_RDNA2;
3959
+ #else
3960
+ const int mmq_x = MMQ_X_Q4_K_RDNA1;
3961
+ const int mmq_y = MMQ_Y_Q4_K_RDNA1;
3962
+ const int nwarps = NWARPS_Q4_K_RDNA1;
3963
+ #endif // defined(RDNA3) || defined(RDNA2)
3964
+
3965
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
3966
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3967
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3968
+
3969
+ #elif __CUDA_ARCH__ >= CC_TURING
3709
3970
  const int mmq_x = MMQ_X_Q4_K_AMPERE;
3710
3971
  const int mmq_y = MMQ_Y_Q4_K_AMPERE;
3711
3972
  const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -3728,6 +3989,12 @@ template <bool need_check> static __global__ void
3728
3989
  #endif // __CUDA_ARCH__ >= CC_TURING
3729
3990
  }
3730
3991
 
3992
+ #define MMQ_X_Q5_K_RDNA2 64
3993
+ #define MMQ_Y_Q5_K_RDNA2 128
3994
+ #define NWARPS_Q5_K_RDNA2 8
3995
+ #define MMQ_X_Q5_K_RDNA1 32
3996
+ #define MMQ_Y_Q5_K_RDNA1 64
3997
+ #define NWARPS_Q5_K_RDNA1 8
3731
3998
  #define MMQ_X_Q5_K_AMPERE 64
3732
3999
  #define MMQ_Y_Q5_K_AMPERE 128
3733
4000
  #define NWARPS_Q5_K_AMPERE 4
@@ -3735,11 +4002,32 @@ template <bool need_check> static __global__ void
3735
4002
  #define MMQ_Y_Q5_K_PASCAL 64
3736
4003
  #define NWARPS_Q5_K_PASCAL 8
3737
4004
 
3738
- template <bool need_check> static __global__ void mul_mat_q5_K(
4005
+ template <bool need_check> static __global__ void
4006
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4007
+ #if defined(RDNA3) || defined(RDNA2)
4008
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
4009
+ #endif // defined(RDNA3) || defined(RDNA2)
4010
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4011
+ mul_mat_q5_K(
3739
4012
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3740
4013
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3741
4014
 
3742
- #if __CUDA_ARCH__ >= CC_TURING
4015
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4016
+ #if defined(RDNA3) || defined(RDNA2)
4017
+ const int mmq_x = MMQ_X_Q5_K_RDNA2;
4018
+ const int mmq_y = MMQ_Y_Q5_K_RDNA2;
4019
+ const int nwarps = NWARPS_Q5_K_RDNA2;
4020
+ #else
4021
+ const int mmq_x = MMQ_X_Q5_K_RDNA1;
4022
+ const int mmq_y = MMQ_Y_Q5_K_RDNA1;
4023
+ const int nwarps = NWARPS_Q5_K_RDNA1;
4024
+ #endif // defined(RDNA3) || defined(RDNA2)
4025
+
4026
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4027
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4028
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4029
+
4030
+ #elif __CUDA_ARCH__ >= CC_TURING
3743
4031
  const int mmq_x = MMQ_X_Q5_K_AMPERE;
3744
4032
  const int mmq_y = MMQ_Y_Q5_K_AMPERE;
3745
4033
  const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -3762,6 +4050,12 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
3762
4050
  #endif // __CUDA_ARCH__ >= CC_TURING
3763
4051
  }
3764
4052
 
4053
+ #define MMQ_X_Q6_K_RDNA2 64
4054
+ #define MMQ_Y_Q6_K_RDNA2 128
4055
+ #define NWARPS_Q6_K_RDNA2 8
4056
+ #define MMQ_X_Q6_K_RDNA1 32
4057
+ #define MMQ_Y_Q6_K_RDNA1 64
4058
+ #define NWARPS_Q6_K_RDNA1 8
3765
4059
  #define MMQ_X_Q6_K_AMPERE 64
3766
4060
  #define MMQ_Y_Q6_K_AMPERE 64
3767
4061
  #define NWARPS_Q6_K_AMPERE 4
@@ -3770,14 +4064,33 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
3770
4064
  #define NWARPS_Q6_K_PASCAL 8
3771
4065
 
3772
4066
  template <bool need_check> static __global__ void
3773
- #if __CUDA_ARCH__ < CC_TURING
4067
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4068
+ #if defined(RDNA3) || defined(RDNA2)
4069
+ __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
4070
+ #endif // defined(RDNA3) || defined(RDNA2)
4071
+ #elif __CUDA_ARCH__ < CC_TURING
3774
4072
  __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
3775
4073
  #endif // __CUDA_ARCH__ < CC_TURING
3776
4074
  mul_mat_q6_K(
3777
4075
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3778
4076
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3779
4077
 
3780
- #if __CUDA_ARCH__ >= CC_TURING
4078
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4079
+ #if defined(RDNA3) || defined(RDNA2)
4080
+ const int mmq_x = MMQ_X_Q6_K_RDNA2;
4081
+ const int mmq_y = MMQ_Y_Q6_K_RDNA2;
4082
+ const int nwarps = NWARPS_Q6_K_RDNA2;
4083
+ #else
4084
+ const int mmq_x = MMQ_X_Q6_K_RDNA1;
4085
+ const int mmq_y = MMQ_Y_Q6_K_RDNA1;
4086
+ const int nwarps = NWARPS_Q6_K_RDNA1;
4087
+ #endif // defined(RDNA3) || defined(RDNA2)
4088
+
4089
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4090
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4091
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4092
+
4093
+ #elif __CUDA_ARCH__ >= CC_TURING
3781
4094
  const int mmq_x = MMQ_X_Q6_K_AMPERE;
3782
4095
  const int mmq_y = MMQ_Y_Q6_K_AMPERE;
3783
4096
  const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4086,7 +4399,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
4086
4399
  dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
4087
4400
  }
4088
4401
 
4089
- static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
4402
+ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
4403
+ const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
4090
4404
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
4091
4405
  const int half_n_dims = ncols/4;
4092
4406
 
@@ -4098,8 +4412,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
4098
4412
  const int i = row*ncols + col;
4099
4413
 
4100
4414
  const float col_theta_scale = powf(theta_scale, col);
4415
+ const float p = p0 + p_delta*(row/p_delta_rows);
4101
4416
 
4102
- const float theta = p*col_theta_scale;
4417
+ const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
4103
4418
  const float sin_theta = sinf(theta);
4104
4419
  const float cos_theta = cosf(theta);
4105
4420
 
@@ -4109,7 +4424,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
4109
4424
  dst[i + 0] = x0*cos_theta - x1*sin_theta;
4110
4425
  dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
4111
4426
 
4112
- const float block_theta = block_p*col_theta_scale;
4427
+ const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
4113
4428
  const float sin_block_theta = sinf(block_theta);
4114
4429
  const float cos_block_theta = cosf(block_theta);
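The GLM rope kernel no longer receives precomputed p and block_p; it derives them per row from p0, p_delta and n_ctx. Restated as formulas, with integer division for row/p_delta_rows and c the column index:

    p           = p0 + p_delta*(row / p_delta_rows)
    theta       = min(p, p_delta*(n_ctx - 2)) * theta_scale^c
    block_theta = max(p - p_delta*(n_ctx - 2), 0) * theta_scale^c

so the primary rotation angle saturates at p_delta*(n_ctx - 2) and any position beyond that threshold is carried by block_theta instead.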
4115
4430
 
@@ -4558,7 +4873,15 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
4558
4873
  const int compute_capability = g_compute_capabilities[id];
4559
4874
 
4560
4875
  int mmq_x, mmq_y, nwarps;
4561
- if (compute_capability >= CC_TURING) {
4876
+ if (compute_capability >= CC_RDNA2) {
4877
+ mmq_x = MMQ_X_Q4_0_RDNA2;
4878
+ mmq_y = MMQ_Y_Q4_0_RDNA2;
4879
+ nwarps = NWARPS_Q4_0_RDNA2;
4880
+ } else if (compute_capability >= CC_OFFSET_AMD) {
4881
+ mmq_x = MMQ_X_Q4_0_RDNA1;
4882
+ mmq_y = MMQ_Y_Q4_0_RDNA1;
4883
+ nwarps = NWARPS_Q4_0_RDNA1;
4884
+ } else if (compute_capability >= CC_TURING) {
4562
4885
  mmq_x = MMQ_X_Q4_0_AMPERE;
4563
4886
  mmq_y = MMQ_Y_Q4_0_AMPERE;
4564
4887
  nwarps = NWARPS_Q4_0_AMPERE;
@@ -4595,7 +4918,15 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
4595
4918
  const int compute_capability = g_compute_capabilities[id];
4596
4919
 
4597
4920
  int mmq_x, mmq_y, nwarps;
4598
- if (compute_capability >= CC_TURING) {
4921
+ if (compute_capability >= CC_RDNA2) {
4922
+ mmq_x = MMQ_X_Q4_1_RDNA2;
4923
+ mmq_y = MMQ_Y_Q4_1_RDNA2;
4924
+ nwarps = NWARPS_Q4_1_RDNA2;
4925
+ } else if (compute_capability >= CC_OFFSET_AMD) {
4926
+ mmq_x = MMQ_X_Q4_1_RDNA1;
4927
+ mmq_y = MMQ_Y_Q4_1_RDNA1;
4928
+ nwarps = NWARPS_Q4_1_RDNA1;
4929
+ } else if (compute_capability >= CC_TURING) {
4599
4930
  mmq_x = MMQ_X_Q4_1_AMPERE;
4600
4931
  mmq_y = MMQ_Y_Q4_1_AMPERE;
4601
4932
  nwarps = NWARPS_Q4_1_AMPERE;
@@ -4632,7 +4963,15 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
4632
4963
  const int compute_capability = g_compute_capabilities[id];
4633
4964
 
4634
4965
  int mmq_x, mmq_y, nwarps;
4635
- if (compute_capability >= CC_TURING) {
4966
+ if (compute_capability >= CC_RDNA2) {
4967
+ mmq_x = MMQ_X_Q5_0_RDNA2;
4968
+ mmq_y = MMQ_Y_Q5_0_RDNA2;
4969
+ nwarps = NWARPS_Q5_0_RDNA2;
4970
+ } else if (compute_capability >= CC_OFFSET_AMD) {
4971
+ mmq_x = MMQ_X_Q5_0_RDNA1;
4972
+ mmq_y = MMQ_Y_Q5_0_RDNA1;
4973
+ nwarps = NWARPS_Q5_0_RDNA1;
4974
+ } else if (compute_capability >= CC_TURING) {
4636
4975
  mmq_x = MMQ_X_Q5_0_AMPERE;
4637
4976
  mmq_y = MMQ_Y_Q5_0_AMPERE;
4638
4977
  nwarps = NWARPS_Q5_0_AMPERE;
@@ -4669,7 +5008,15 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
4669
5008
  const int compute_capability = g_compute_capabilities[id];
4670
5009
 
4671
5010
  int mmq_x, mmq_y, nwarps;
4672
- if (compute_capability >= CC_TURING) {
5011
+ if (compute_capability >= CC_RDNA2) {
5012
+ mmq_x = MMQ_X_Q5_1_RDNA2;
5013
+ mmq_y = MMQ_Y_Q5_1_RDNA2;
5014
+ nwarps = NWARPS_Q5_1_RDNA2;
5015
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5016
+ mmq_x = MMQ_X_Q5_1_RDNA1;
5017
+ mmq_y = MMQ_Y_Q5_1_RDNA1;
5018
+ nwarps = NWARPS_Q5_1_RDNA1;
5019
+ } else if (compute_capability >= CC_TURING) {
4673
5020
  mmq_x = MMQ_X_Q5_1_AMPERE;
4674
5021
  mmq_y = MMQ_Y_Q5_1_AMPERE;
4675
5022
  nwarps = NWARPS_Q5_1_AMPERE;
@@ -4706,7 +5053,15 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
4706
5053
  const int compute_capability = g_compute_capabilities[id];
4707
5054
 
4708
5055
  int mmq_x, mmq_y, nwarps;
4709
- if (compute_capability >= CC_TURING) {
5056
+ if (compute_capability >= CC_RDNA2) {
5057
+ mmq_x = MMQ_X_Q8_0_RDNA2;
5058
+ mmq_y = MMQ_Y_Q8_0_RDNA2;
5059
+ nwarps = NWARPS_Q8_0_RDNA2;
5060
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5061
+ mmq_x = MMQ_X_Q8_0_RDNA1;
5062
+ mmq_y = MMQ_Y_Q8_0_RDNA1;
5063
+ nwarps = NWARPS_Q8_0_RDNA1;
5064
+ } else if (compute_capability >= CC_TURING) {
4710
5065
  mmq_x = MMQ_X_Q8_0_AMPERE;
4711
5066
  mmq_y = MMQ_Y_Q8_0_AMPERE;
4712
5067
  nwarps = NWARPS_Q8_0_AMPERE;
@@ -4743,7 +5098,15 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
4743
5098
  const int compute_capability = g_compute_capabilities[id];
4744
5099
 
4745
5100
  int mmq_x, mmq_y, nwarps;
4746
- if (compute_capability >= CC_TURING) {
5101
+ if (compute_capability >= CC_RDNA2) {
5102
+ mmq_x = MMQ_X_Q2_K_RDNA2;
5103
+ mmq_y = MMQ_Y_Q2_K_RDNA2;
5104
+ nwarps = NWARPS_Q2_K_RDNA2;
5105
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5106
+ mmq_x = MMQ_X_Q2_K_RDNA1;
5107
+ mmq_y = MMQ_Y_Q2_K_RDNA1;
5108
+ nwarps = NWARPS_Q2_K_RDNA1;
5109
+ } else if (compute_capability >= CC_TURING) {
4747
5110
  mmq_x = MMQ_X_Q2_K_AMPERE;
4748
5111
  mmq_y = MMQ_Y_Q2_K_AMPERE;
4749
5112
  nwarps = NWARPS_Q2_K_AMPERE;
@@ -4782,7 +5145,15 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
4782
5145
  const int compute_capability = g_compute_capabilities[id];
4783
5146
 
4784
5147
  int mmq_x, mmq_y, nwarps;
4785
- if (compute_capability >= CC_TURING) {
5148
+ if (compute_capability >= CC_RDNA2) {
5149
+ mmq_x = MMQ_X_Q3_K_RDNA2;
5150
+ mmq_y = MMQ_Y_Q3_K_RDNA2;
5151
+ nwarps = NWARPS_Q3_K_RDNA2;
5152
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5153
+ mmq_x = MMQ_X_Q3_K_RDNA1;
5154
+ mmq_y = MMQ_Y_Q3_K_RDNA1;
5155
+ nwarps = NWARPS_Q3_K_RDNA1;
5156
+ } else if (compute_capability >= CC_TURING) {
4786
5157
  mmq_x = MMQ_X_Q3_K_AMPERE;
4787
5158
  mmq_y = MMQ_Y_Q3_K_AMPERE;
4788
5159
  nwarps = NWARPS_Q3_K_AMPERE;
@@ -4820,7 +5191,15 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
4820
5191
  const int compute_capability = g_compute_capabilities[id];
4821
5192
 
4822
5193
  int mmq_x, mmq_y, nwarps;
4823
- if (compute_capability >= CC_TURING) {
5194
+ if (compute_capability >= CC_RDNA2) {
5195
+ mmq_x = MMQ_X_Q4_K_RDNA2;
5196
+ mmq_y = MMQ_Y_Q4_K_RDNA2;
5197
+ nwarps = NWARPS_Q4_K_RDNA2;
5198
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5199
+ mmq_x = MMQ_X_Q4_K_RDNA1;
5200
+ mmq_y = MMQ_Y_Q4_K_RDNA1;
5201
+ nwarps = NWARPS_Q4_K_RDNA1;
5202
+ } else if (compute_capability >= CC_TURING) {
4824
5203
  mmq_x = MMQ_X_Q4_K_AMPERE;
4825
5204
  mmq_y = MMQ_Y_Q4_K_AMPERE;
4826
5205
  nwarps = NWARPS_Q4_K_AMPERE;
@@ -4857,7 +5236,15 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
4857
5236
  const int compute_capability = g_compute_capabilities[id];
4858
5237
 
4859
5238
  int mmq_x, mmq_y, nwarps;
4860
- if (compute_capability >= CC_TURING) {
5239
+ if (compute_capability >= CC_RDNA2) {
5240
+ mmq_x = MMQ_X_Q5_K_RDNA2;
5241
+ mmq_y = MMQ_Y_Q5_K_RDNA2;
5242
+ nwarps = NWARPS_Q5_K_RDNA2;
5243
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5244
+ mmq_x = MMQ_X_Q5_K_RDNA1;
5245
+ mmq_y = MMQ_Y_Q5_K_RDNA1;
5246
+ nwarps = NWARPS_Q5_K_RDNA1;
5247
+ } else if (compute_capability >= CC_TURING) {
4861
5248
  mmq_x = MMQ_X_Q5_K_AMPERE;
4862
5249
  mmq_y = MMQ_Y_Q5_K_AMPERE;
4863
5250
  nwarps = NWARPS_Q5_K_AMPERE;
@@ -4894,7 +5281,15 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
4894
5281
  const int compute_capability = g_compute_capabilities[id];
4895
5282
 
4896
5283
  int mmq_x, mmq_y, nwarps;
4897
- if (compute_capability >= CC_TURING) {
5284
+ if (compute_capability >= CC_RDNA2) {
5285
+ mmq_x = MMQ_X_Q6_K_RDNA2;
5286
+ mmq_y = MMQ_Y_Q6_K_RDNA2;
5287
+ nwarps = NWARPS_Q6_K_RDNA2;
5288
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5289
+ mmq_x = MMQ_X_Q6_K_RDNA1;
5290
+ mmq_y = MMQ_Y_Q6_K_RDNA1;
5291
+ nwarps = NWARPS_Q6_K_RDNA1;
5292
+ } else if (compute_capability >= CC_TURING) {
4898
5293
  mmq_x = MMQ_X_Q6_K_AMPERE;
4899
5294
  mmq_y = MMQ_Y_Q6_K_AMPERE;
4900
5295
  nwarps = NWARPS_Q6_K_AMPERE;
@@ -4984,12 +5379,13 @@ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, co
4984
5379
  rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
4985
5380
  }
4986
5381
 
4987
- static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
4988
- GGML_ASSERT(nrows % 4 == 0);
4989
- const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
4990
- const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
5382
+ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
5383
+ const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
5384
+ GGML_ASSERT(ncols % 4 == 0);
5385
+ const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
5386
+ const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
4991
5387
  const dim3 block_nums(num_blocks_x, nrows, 1);
4992
- rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
5388
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
4993
5389
  }
4994
5390
 
4995
5391
  static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5127,25 +5523,30 @@ void ggml_init_cublas() {
5127
5523
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
5128
5524
  int64_t total_vram = 0;
5129
5525
  fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
5130
- for (int id = 0; id < g_device_count; ++id) {
5526
+ for (int64_t id = 0; id < g_device_count; ++id) {
5131
5527
  cudaDeviceProp prop;
5132
5528
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
5133
- fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
5529
+ fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
5134
5530
 
5135
5531
  g_tensor_split[id] = total_vram;
5136
5532
  total_vram += prop.totalGlobalMem;
5137
-
5533
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
5534
+ g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
5535
+ #else
5138
5536
  g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
5537
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
5139
5538
  }
5140
- for (int id = 0; id < g_device_count; ++id) {
5539
+ for (int64_t id = 0; id < g_device_count; ++id) {
5141
5540
  g_tensor_split[id] /= total_vram;
5142
5541
  }
5143
5542
 
5144
- for (int id = 0; id < g_device_count; ++id) {
5145
- CUDA_CHECK(cudaSetDevice(id));
5543
+ for (int64_t id = 0; id < g_device_count; ++id) {
5544
+ CUDA_CHECK(ggml_cuda_set_device(id));
5146
5545
 
5147
- // create main stream
5148
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
5546
+ // create cuda streams
5547
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
5548
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
5549
+ }
5149
5550
 
5150
5551
  // create cublas handle
5151
5552
  CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
@@ -5214,7 +5615,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5214
5615
  if (src->backend == GGML_BACKEND_CPU) {
5215
5616
  kind = cudaMemcpyHostToDevice;
5216
5617
  src_ptr = (char *) src->data;
5217
- } else if (src->backend == GGML_BACKEND_GPU) {
5618
+ } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
5619
+ GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
5218
5620
  kind = cudaMemcpyDeviceToDevice;
5219
5621
  struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
5220
5622
  int id;
@@ -5253,236 +5655,205 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5253
5655
  }
5254
5656
 
5255
5657
  inline void ggml_cuda_op_add(
5256
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5257
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5258
- cudaStream_t & cudaStream_main){
5259
-
5260
- GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
5261
- GGML_ASSERT(src1_ddf_i != nullptr);
5262
- GGML_ASSERT(dst_ddf_i != nullptr);
5658
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5659
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5263
5660
 
5264
- const int64_t ne00 = src0->ne[0];
5265
- const int64_t i01_diff = i01_high - i01_low;
5661
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
5266
5662
 
5267
5663
  const int64_t ne10 = src1->ne[0];
5268
5664
  const int64_t ne11 = src1->ne[1];
5269
5665
 
5270
- // compute
5271
5666
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
5272
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
5667
+ add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
5273
5668
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
5274
- add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
5669
+ add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
5275
5670
  } else {
5276
5671
  GGML_ASSERT(false);
5277
5672
  }
5278
5673
 
5279
5674
  (void) src1;
5280
5675
  (void) dst;
5281
- (void) src0_ddq_i;
5282
- (void) i02;
5283
- (void) i1;
5284
5676
  }
5285
5677
 
5286
5678
  inline void ggml_cuda_op_mul(
5287
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5288
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5289
- cudaStream_t & cudaStream_main){
5679
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5680
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5290
5681
 
5291
- GGML_ASSERT(src0_ddf_i != nullptr);
5292
- GGML_ASSERT(src1_ddf_i != nullptr);
5293
- GGML_ASSERT(dst_ddf_i != nullptr);
5294
-
5295
- const int64_t ne00 = src0->ne[0];
5296
- const int64_t i01_diff = i01_high - i01_low;
5682
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5683
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
5684
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5297
5685
 
5298
5686
  const int64_t ne10 = src1->ne[0];
5299
5687
  const int64_t ne11 = src1->ne[1];
5300
5688
 
5301
- mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
5689
+ mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
5302
5690
 
5303
5691
  (void) dst;
5304
- (void) src0_ddq_i;
5305
- (void) i02;
5306
- (void) i1;
5307
5692
  }
5308
5693
 
5309
5694
  inline void ggml_cuda_op_gelu(
5310
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5311
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5312
- cudaStream_t & cudaStream_main){
5313
-
5314
- GGML_ASSERT(src0_ddf_i != nullptr);
5315
- GGML_ASSERT(dst_ddf_i != nullptr);
5695
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5696
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5316
5697
 
5317
- const int64_t ne00 = src0->ne[0];
5318
- const int64_t i01_diff = i01_high - i01_low;
5698
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5699
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5319
5700
 
5320
- // compute
5321
- gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
5701
+ gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
5322
5702
 
5323
5703
  (void) src1;
5324
5704
  (void) dst;
5325
- (void) src0_ddq_i;
5326
- (void) src1_ddf_i;
5327
- (void) i02;
5328
- (void) i1;
5705
+ (void) src1_dd;
5329
5706
  }
5330
5707
 
5331
5708
  inline void ggml_cuda_op_silu(
5332
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5333
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5334
- cudaStream_t & cudaStream_main){
5335
-
5336
- GGML_ASSERT(src0_ddf_i != nullptr);
5337
- GGML_ASSERT(dst_ddf_i != nullptr);
5709
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5710
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5338
5711
 
5339
- const int64_t ne00 = src0->ne[0];
5340
- const int64_t i01_diff = i01_high - i01_low;
5712
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5713
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5341
5714
 
5342
- // compute
5343
- silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
5715
+ silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
5344
5716
 
5345
5717
  (void) src1;
5346
5718
  (void) dst;
5347
- (void) src0_ddq_i;
5348
- (void) src1_ddf_i;
5349
- (void) i02;
5350
- (void) i1;
5719
+ (void) src1_dd;
5351
5720
  }
5352
5721
 
5353
5722
  inline void ggml_cuda_op_norm(
5354
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5355
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5356
- cudaStream_t & cudaStream_main){
5723
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5724
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5357
5725
 
5358
- GGML_ASSERT(src0_ddf_i != nullptr);
5359
- GGML_ASSERT(dst_ddf_i != nullptr);
5726
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5727
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5360
5728
 
5361
5729
  const int64_t ne00 = src0->ne[0];
5362
- const int64_t i01_diff = i01_high - i01_low;
5730
+ const int64_t nrows = ggml_nrows(src0);
5363
5731
 
5364
- // compute
5365
- norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
5732
+ norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
5366
5733
 
5367
5734
  (void) src1;
5368
5735
  (void) dst;
5369
- (void) src0_ddq_i;
5370
- (void) src1_ddf_i;
5371
- (void) i02;
5372
- (void) i1;
5736
+ (void) src1_dd;
5373
5737
  }
5374
5738
 
5375
5739
  inline void ggml_cuda_op_rms_norm(
5376
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5377
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5378
- cudaStream_t & cudaStream_main){
5740
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5741
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5379
5742
 
5380
- GGML_ASSERT(src0_ddf_i != nullptr);
5381
- GGML_ASSERT(dst_ddf_i != nullptr);
5743
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5744
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5382
5745
 
5383
5746
  const int64_t ne00 = src0->ne[0];
5384
- const int64_t i01_diff = i01_high - i01_low;
5747
+ const int64_t nrows = ggml_nrows(src0);
5385
5748
 
5386
5749
  float eps;
5387
5750
  memcpy(&eps, dst->op_params, sizeof(float));
5388
5751
 
5389
- // compute
5390
- rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
5752
+ rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
5391
5753
 
5392
5754
  (void) src1;
5393
5755
  (void) dst;
5394
- (void) src0_ddq_i;
5395
- (void) src1_ddf_i;
5396
- (void) i02;
5397
- (void) i1;
5756
+ (void) src1_dd;
5398
5757
  }
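
For reference, a minimal CPU sketch of the per-row computation that rms_norm_f32_cuda performs on the device, with eps read from dst->op_params exactly as above (the helper name rms_norm_row is made up for this illustration):

    #include <cmath>
    #include <cstdio>

    // normalize one row of n values: y = x / sqrt(mean(x^2) + eps)
    static void rms_norm_row(const float * x, float * y, int n, float eps) {
        float sum = 0.0f;
        for (int i = 0; i < n; ++i) {
            sum += x[i]*x[i];
        }
        const float scale = 1.0f/sqrtf(sum/n + eps);
        for (int i = 0; i < n; ++i) {
            y[i] = scale*x[i];
        }
    }

    int main() {
        const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        float y[4];
        rms_norm_row(x, y, 4, 1e-6f);
        printf("%f %f %f %f\n", y[0], y[1], y[2], y[3]);
        return 0;
    }
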
5399
5758
 
5400
5759
  inline void ggml_cuda_op_mul_mat_q(
5401
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5402
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5403
- cudaStream_t & cudaStream_main){
5404
-
5405
- GGML_ASSERT(src0_ddq_i != nullptr);
5406
- GGML_ASSERT(src1_ddf_i != nullptr);
5407
- GGML_ASSERT(dst_ddf_i != nullptr);
5760
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5761
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
5762
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5408
5763
 
5409
5764
  const int64_t ne00 = src0->ne[0];
5410
5765
 
5411
5766
  const int64_t ne10 = src1->ne[0];
5412
- const int64_t ne11 = src1->ne[1];
5413
5767
  GGML_ASSERT(ne10 % QK8_1 == 0);
5414
5768
 
5415
5769
  const int64_t ne0 = dst->ne[0];
5416
5770
 
5417
- const int64_t i01_diff = i01_high - i01_low;
5771
+ const int64_t row_diff = row_high - row_low;
5418
5772
 
5419
5773
  int id;
5420
5774
  CUDA_CHECK(cudaGetDevice(&id));
5421
5775
 
5422
5776
  // the main device has a larger memory buffer to hold the results from all GPUs
5423
5777
  // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
5424
- const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
5425
-
5426
- const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
5427
- ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
5428
- size_t as;
5429
- void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
5430
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
5778
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
5431
5779
 
5432
5780
  switch (src0->type) {
5433
5781
  case GGML_TYPE_Q4_0:
5434
- ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5782
+ ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5435
5783
  break;
5436
5784
  case GGML_TYPE_Q4_1:
5437
- ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5785
+ ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5438
5786
  break;
5439
5787
  case GGML_TYPE_Q5_0:
5440
- ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5788
+ ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5441
5789
  break;
5442
5790
  case GGML_TYPE_Q5_1:
5443
- ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5791
+ ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5444
5792
  break;
5445
5793
  case GGML_TYPE_Q8_0:
5446
- ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5794
+ ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5447
5795
  break;
5448
5796
  case GGML_TYPE_Q2_K:
5449
- ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5797
+ ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5450
5798
  break;
5451
5799
  case GGML_TYPE_Q3_K:
5452
- ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5800
+ ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5453
5801
  break;
5454
5802
  case GGML_TYPE_Q4_K:
5455
- ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5803
+ ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5456
5804
  break;
5457
5805
  case GGML_TYPE_Q5_K:
5458
- ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5806
+ ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5459
5807
  break;
5460
5808
  case GGML_TYPE_Q6_K:
5461
- ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5809
+ ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5462
5810
  break;
5463
5811
  default:
5464
5812
  GGML_ASSERT(false);
5465
5813
  break;
5466
5814
  }
5467
5815
 
5468
- ggml_cuda_pool_free(src1_q8_1, as);
5469
-
5470
5816
  (void) src1;
5471
5817
  (void) dst;
5472
- (void) src0_ddf_i;
5473
- (void) i02;
5474
- (void) i1;
5818
+ (void) src1_ddf_i;
5475
5819
  }
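
With this refactor, src1 is quantized to q8_1 once per ggml_cuda_op_mul_mat call instead of inside each op, using a row length padded up to a multiple of MATRIX_ROW_PADDING. A stand-alone sketch of that padding arithmetic, assuming MATRIX_ROW_PADDING is 512 as defined earlier in this file:

    #include <cstdint>
    #include <cstdio>

    // round a row length up to the next multiple of `padding`, as done for src1_padded_col_size
    static int64_t pad_row_size(int64_t ne10, int64_t padding) {
        return ne10 % padding == 0 ? ne10 : ne10 - ne10 % padding + padding;
    }

    int main() {
        const int64_t MATRIX_ROW_PADDING = 512; // assumed value
        printf("%lld\n", (long long) pad_row_size(4096, MATRIX_ROW_PADDING)); // 4096, already aligned
        printf("%lld\n", (long long) pad_row_size(4097, MATRIX_ROW_PADDING)); // 4608
        return 0;
    }
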
5476
5820
 
5477
5821
  static int64_t get_row_rounding(ggml_type type) {
5478
- int max_compute_capability = INT_MIN;
5479
- for (int id = 0; id < g_device_count; ++id) {
5480
- if (max_compute_capability < g_compute_capabilities[id]
5481
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
5482
- max_compute_capability = g_compute_capabilities[id];
5822
+ int64_t min_compute_capability = INT_MAX;
5823
+ int64_t max_compute_capability = INT_MIN;
5824
+ for (int64_t id = 0; id < g_device_count; ++id) {
5825
+ if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
5826
+ if (min_compute_capability > g_compute_capabilities[id]) {
5827
+ min_compute_capability = g_compute_capabilities[id];
5828
+ }
5829
+ if (max_compute_capability < g_compute_capabilities[id]) {
5830
+ max_compute_capability = g_compute_capabilities[id];
5831
+ }
5483
5832
  }
5484
5833
  }
5485
5834
 
5835
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
5836
+ switch(type) {
5837
+ case GGML_TYPE_Q4_0:
5838
+ case GGML_TYPE_Q4_1:
5839
+ case GGML_TYPE_Q5_0:
5840
+ case GGML_TYPE_Q5_1:
5841
+ case GGML_TYPE_Q8_0:
5842
+ return max_compute_capability >= CC_RDNA2 ? 128 : 64;
5843
+ case GGML_TYPE_F16:
5844
+ return 1;
5845
+ case GGML_TYPE_Q2_K:
5846
+ return max_compute_capability >= CC_RDNA2 ? 128 : 32;
5847
+ case GGML_TYPE_Q3_K:
5848
+ return min_compute_capability < CC_RDNA2 ? 128 : 64;
5849
+ case GGML_TYPE_Q4_K:
5850
+ case GGML_TYPE_Q5_K:
5851
+ case GGML_TYPE_Q6_K:
5852
+ return max_compute_capability >= CC_RDNA2 ? 128 : 64;
5853
+ default:
5854
+ GGML_ASSERT(false);
5855
+ }
5856
+ #else
5486
5857
  switch(type) {
5487
5858
  case GGML_TYPE_Q4_0:
5488
5859
  case GGML_TYPE_Q4_1:
@@ -5503,170 +5874,147 @@ static int64_t get_row_rounding(ggml_type type) {
5503
5874
  default:
5504
5875
  GGML_ASSERT(false);
5505
5876
  }
5877
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
5506
5878
  }
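
The value returned here is applied when the per-GPU row boundaries are derived from g_tensor_split in ggml_cuda_op_mul_mat further down. A minimal sketch of that boundary computation, with made-up split fractions and a made-up rounding value:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int     device_count    = 2;
        const float   tensor_split[2] = {0.0f, 0.6f}; // illustrative cumulative start fractions
        const int64_t ne01            = 4096;         // rows of src0
        const int64_t rounding        = 64;           // illustrative value, as if returned by get_row_rounding()

        for (int id = 0; id < device_count; ++id) {
            int64_t row_low  = 0;
            int64_t row_high = ne01;
            if (id != 0) {
                row_low  = (int64_t) (ne01*tensor_split[id]);
                row_low -= row_low % rounding;
            }
            if (id != device_count - 1) {
                row_high  = (int64_t) (ne01*tensor_split[id + 1]);
                row_high -= row_high % rounding;
            }
            printf("device %d: rows [%lld, %lld)\n", id, (long long) row_low, (long long) row_high);
        }
        return 0;
    }
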
5507
5879
 
5508
- inline void ggml_cuda_op_mul_mat_vec(
5509
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5510
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5511
- cudaStream_t & cudaStream_main){
5512
-
5513
- GGML_ASSERT(src0_ddq_i != nullptr);
5514
- GGML_ASSERT(src1_ddf_i != nullptr);
5515
- GGML_ASSERT(dst_ddf_i != nullptr);
5880
+ inline void ggml_cuda_op_mul_mat_vec_q(
5881
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5882
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
5883
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5516
5884
 
5517
5885
  const int64_t ne00 = src0->ne[0];
5518
- const int64_t nrows = i01_high - i01_low;
5886
+ const int64_t row_diff = row_high - row_low;
5519
5887
 
5520
- #ifdef GGML_CUDA_FORCE_DMMV
5521
- const bool use_mul_mat_vec_q = false;
5522
- (void) g_compute_capabilities[0];
5523
- #else
5524
- int id;
5525
- CUDA_CHECK(cudaGetDevice(&id));
5888
+ switch (src0->type) {
5889
+ case GGML_TYPE_Q4_0:
5890
+ mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5891
+ break;
5892
+ case GGML_TYPE_Q4_1:
5893
+ mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5894
+ break;
5895
+ case GGML_TYPE_Q5_0:
5896
+ mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5897
+ break;
5898
+ case GGML_TYPE_Q5_1:
5899
+ mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5900
+ break;
5901
+ case GGML_TYPE_Q8_0:
5902
+ mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5903
+ break;
5904
+ case GGML_TYPE_Q2_K:
5905
+ mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5906
+ break;
5907
+ case GGML_TYPE_Q3_K:
5908
+ mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5909
+ break;
5910
+ case GGML_TYPE_Q4_K:
5911
+ mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5912
+ break;
5913
+ case GGML_TYPE_Q5_K:
5914
+ mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5915
+ break;
5916
+ case GGML_TYPE_Q6_K:
5917
+ mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5918
+ break;
5919
+ default:
5920
+ GGML_ASSERT(false);
5921
+ break;
5922
+ }
5526
5923
 
5527
- bool mul_mat_vec_q_implemented =
5528
- src0->type == GGML_TYPE_Q4_0 ||
5529
- src0->type == GGML_TYPE_Q4_1 ||
5530
- src0->type == GGML_TYPE_Q5_0 ||
5531
- src0->type == GGML_TYPE_Q5_1 ||
5532
- src0->type == GGML_TYPE_Q8_0;
5533
- #if QK_K == 256
5534
- mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
5535
- src0->type == GGML_TYPE_Q2_K ||
5536
- src0->type == GGML_TYPE_Q3_K ||
5537
- src0->type == GGML_TYPE_Q4_K ||
5538
- src0->type == GGML_TYPE_Q5_K ||
5539
- src0->type == GGML_TYPE_Q6_K;
5540
- #endif // QK_K == 256
5541
-
5542
- const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
5543
- #endif
5924
+ (void) src1;
5925
+ (void) dst;
5926
+ (void) src1_ddf_i;
5927
+ (void) src1_ncols;
5928
+ (void) src1_padded_row_size;
5929
+ }
5544
5930
 
5545
- if (use_mul_mat_vec_q) {
5546
- const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
5547
- ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
5548
- size_t as;
5549
- void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
5550
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
5551
-
5552
- switch (src0->type) {
5553
- case GGML_TYPE_Q4_0:
5554
- mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5555
- break;
5556
- case GGML_TYPE_Q4_1:
5557
- mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5558
- break;
5559
- case GGML_TYPE_Q5_0:
5560
- mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5561
- break;
5562
- case GGML_TYPE_Q5_1:
5563
- mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5564
- break;
5565
- case GGML_TYPE_Q8_0:
5566
- mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5567
- break;
5568
- case GGML_TYPE_Q2_K:
5569
- mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5570
- break;
5571
- case GGML_TYPE_Q3_K:
5572
- mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5573
- break;
5574
- case GGML_TYPE_Q4_K:
5575
- mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5576
- break;
5577
- case GGML_TYPE_Q5_K:
5578
- mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5579
- break;
5580
- case GGML_TYPE_Q6_K:
5581
- mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5582
- break;
5583
- default:
5584
- GGML_ASSERT(false);
5585
- break;
5586
- }
5931
+ inline void ggml_cuda_op_dequantize_mul_mat_vec(
5932
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5933
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
5934
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5587
5935
 
5588
- ggml_cuda_pool_free(src1_q8_1, as);
5589
- } else {
5590
- // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
5936
+ const int64_t ne00 = src0->ne[0];
5937
+ const int64_t row_diff = row_high - row_low;
5938
+
5939
+ // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
5591
5940
  #ifdef GGML_CUDA_F16
5592
- size_t ash;
5593
- dfloat * src1_dfloat = nullptr; // dfloat == half
5594
-
5595
- bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
5596
- src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
5597
- src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
5598
-
5599
- if (src1_convert_f16) {
5600
- src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
5601
- ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
5602
- ne00, 1, sizeof(float), 0, 0,
5603
- ne00, 1, sizeof(half), 0, 0, cudaStream_main);
5604
- }
5941
+ size_t ash;
5942
+ dfloat * src1_dfloat = nullptr; // dfloat == half
5943
+
5944
+ bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
5945
+ src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
5946
+ src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
5947
+
5948
+ if (src1_convert_f16) {
5949
+ src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
5950
+ ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
5951
+ ne00, 1, sizeof(float), 0, 0,
5952
+ ne00, 1, sizeof(half), 0, 0, stream);
5953
+ }
5605
5954
  #else
5606
- dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
5955
+ const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
5607
5956
  #endif // GGML_CUDA_F16
5608
5957
 
5609
- switch (src0->type) {
5610
- case GGML_TYPE_Q4_0:
5611
- dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5612
- break;
5613
- case GGML_TYPE_Q4_1:
5614
- dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5615
- break;
5616
- case GGML_TYPE_Q5_0:
5617
- dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5618
- break;
5619
- case GGML_TYPE_Q5_1:
5620
- dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5621
- break;
5622
- case GGML_TYPE_Q8_0:
5623
- dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5624
- break;
5625
- case GGML_TYPE_Q2_K:
5626
- dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5627
- break;
5628
- case GGML_TYPE_Q3_K:
5629
- dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5630
- break;
5631
- case GGML_TYPE_Q4_K:
5632
- dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5633
- break;
5634
- case GGML_TYPE_Q5_K:
5635
- dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5636
- break;
5637
- case GGML_TYPE_Q6_K:
5638
- dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5639
- break;
5640
- case GGML_TYPE_F16:
5641
- convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5642
- break;
5643
- default:
5644
- GGML_ASSERT(false);
5645
- break;
5646
- }
5958
+ switch (src0->type) {
5959
+ case GGML_TYPE_Q4_0:
5960
+ dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5961
+ break;
5962
+ case GGML_TYPE_Q4_1:
5963
+ dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5964
+ break;
5965
+ case GGML_TYPE_Q5_0:
5966
+ dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5967
+ break;
5968
+ case GGML_TYPE_Q5_1:
5969
+ dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5970
+ break;
5971
+ case GGML_TYPE_Q8_0:
5972
+ dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5973
+ break;
5974
+ case GGML_TYPE_Q2_K:
5975
+ dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5976
+ break;
5977
+ case GGML_TYPE_Q3_K:
5978
+ dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5979
+ break;
5980
+ case GGML_TYPE_Q4_K:
5981
+ dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5982
+ break;
5983
+ case GGML_TYPE_Q5_K:
5984
+ dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5985
+ break;
5986
+ case GGML_TYPE_Q6_K:
5987
+ dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5988
+ break;
5989
+ case GGML_TYPE_F16:
5990
+ convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5991
+ break;
5992
+ default:
5993
+ GGML_ASSERT(false);
5994
+ break;
5995
+ }
5647
5996
 
5648
5997
  #ifdef GGML_CUDA_F16
5649
- if (src1_convert_f16) {
5650
- ggml_cuda_pool_free(src1_dfloat, ash);
5651
- }
5652
- #endif // GGML_CUDA_F16
5998
+ if (src1_convert_f16) {
5999
+ ggml_cuda_pool_free(src1_dfloat, ash);
5653
6000
  }
6001
+ #endif // GGML_CUDA_F16
5654
6002
 
5655
6003
  (void) src1;
5656
6004
  (void) dst;
5657
- (void) src0_ddf_i;
5658
- (void) i02;
5659
- (void) i1;
6005
+ (void) src1_ddq_i;
6006
+ (void) src1_ncols;
6007
+ (void) src1_padded_row_size;
5660
6008
  }
5661
6009
 
5662
6010
  inline void ggml_cuda_op_mul_mat_cublas(
5663
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5664
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5665
- cudaStream_t & cudaStream_main){
6011
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
6012
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
6013
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5666
6014
 
5667
- GGML_ASSERT(src0_ddf_i != nullptr);
6015
+ GGML_ASSERT(src0_dd_i != nullptr);
5668
6016
  GGML_ASSERT(src1_ddf_i != nullptr);
5669
- GGML_ASSERT(dst_ddf_i != nullptr);
6017
+ GGML_ASSERT(dst_dd_i != nullptr);
5670
6018
 
5671
6019
  const float alpha = 1.0f;
5672
6020
  const float beta = 0.0f;
@@ -5674,43 +6022,54 @@ inline void ggml_cuda_op_mul_mat_cublas(
5674
6022
  const int64_t ne00 = src0->ne[0];
5675
6023
 
5676
6024
  const int64_t ne10 = src1->ne[0];
5677
- const int64_t ne11 = src1->ne[1];
5678
6025
 
5679
6026
  const int64_t ne0 = dst->ne[0];
5680
- const int64_t i01_diff = i01_high - i01_low;
6027
+ const int64_t row_diff = row_high - row_low;
6028
+
6029
+ float * src0_ddq_as_f32;
6030
+ size_t src0_as = 0;
6031
+
6032
+ if (src0->type != GGML_TYPE_F32) {
6033
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
6034
+ src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
6035
+ to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
6036
+ }
6037
+ const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
5681
6038
 
5682
6039
  int id;
5683
6040
  CUDA_CHECK(cudaGetDevice(&id));
5684
6041
 
5685
6042
  // the main device has a larger memory buffer to hold the results from all GPUs
5686
6043
  // ldc == nrows of the matrix that cuBLAS writes into
5687
- int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
6044
+ int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
5688
6045
 
5689
- CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], cudaStream_main));
6046
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
5690
6047
  CUBLAS_CHECK(
5691
6048
  cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
5692
- i01_diff, ne11, ne10,
6049
+ row_diff, src1_ncols, ne10,
5693
6050
  &alpha, src0_ddf_i, ne00,
5694
- src1_ddf_i, ne10,
5695
- &beta, dst_ddf_i, ldc));
6051
+ src1_ddf_i, ne10,
6052
+ &beta, dst_dd_i, ldc));
6053
+
6054
+ if (src0_as > 0) {
6055
+ ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
6056
+ }
5696
6057
 
5697
6058
  (void) dst;
5698
- (void) src0_ddq_i;
5699
- (void) i02;
5700
- (void) i1;
6059
+ (void) src1_ddq_i;
6060
+ (void) src1_padded_row_size;
5701
6061
  }
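
The CUBLAS_OP_T/CUBLAS_OP_N operand order above is what lets row-major ggml data be fed directly to column-major cuBLAS: the call stores the dot product of src0 row i with the j-th contiguous k-length block of src1 at dst[j*ld + i]. A small self-contained illustration with made-up 2x3 inputs (error checking omitted for brevity):

    #include <cuda_runtime.h>
    #include <cublas_v2.h>
    #include <cstdio>

    int main() {
        const int rows = 2, cols = 2, k = 3; // rows of src0, columns taken from src1, shared dimension
        const float src0[rows*k] = {1, 2, 3,  4, 5, 6}; // row-major, k values per row
        const float src1[cols*k] = {1, 0, 1,  0, 1, 0}; // k contiguous values per column
        float dst[rows*cols] = {0};

        float * d_src0; float * d_src1; float * d_dst;
        cudaMalloc(&d_src0, sizeof(src0));
        cudaMalloc(&d_src1, sizeof(src1));
        cudaMalloc(&d_dst,  sizeof(dst));
        cudaMemcpy(d_src0, src0, sizeof(src0), cudaMemcpyHostToDevice);
        cudaMemcpy(d_src1, src1, sizeof(src1), cudaMemcpyHostToDevice);

        cublasHandle_t handle;
        cublasCreate(&handle);

        const float alpha = 1.0f;
        const float beta  = 0.0f;
        // same operand order as above: m = rows of src0, n = columns of src1, k = shared dimension
        cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, rows, cols, k,
                    &alpha, d_src0, k, d_src1, k, &beta, d_dst, rows);

        cudaMemcpy(dst, d_dst, sizeof(dst), cudaMemcpyDeviceToHost);
        printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]); // 4 10 2 5

        cublasDestroy(handle);
        cudaFree(d_src0); cudaFree(d_src1); cudaFree(d_dst);
        return 0;
    }
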
5702
6062
 
5703
6063
  inline void ggml_cuda_op_rope(
5704
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5705
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5706
- cudaStream_t & cudaStream_main){
6064
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6065
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5707
6066
 
5708
- GGML_ASSERT(src0_ddf_i != nullptr);
5709
- GGML_ASSERT(dst_ddf_i != nullptr);
6067
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6068
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5710
6069
 
5711
6070
  const int64_t ne00 = src0->ne[0];
5712
6071
  const int64_t ne01 = src0->ne[1];
5713
- const int64_t i01_diff = i01_high - i01_low;
6072
+ const int64_t nrows = ggml_nrows(src0);
5714
6073
 
5715
6074
  const int n_past = ((int32_t *) dst->op_params)[0];
5716
6075
  const int n_dims = ((int32_t *) dst->op_params)[1];
@@ -5723,44 +6082,37 @@ inline void ggml_cuda_op_rope(
5723
6082
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
5724
6083
 
5725
6084
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
6085
+ const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5726
6086
 
5727
6087
  const bool is_neox = mode & 2;
5728
6088
  const bool is_glm = mode & 4;
5729
6089
 
5730
6090
  // compute
5731
6091
  if (is_glm) {
5732
- const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
5733
- const float id_p = min(p, n_ctx - 2.f);
5734
- const float block_p = max(p - (n_ctx - 2.f), 0.f);
5735
- rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
6092
+ rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream);
5736
6093
  } else if (is_neox) {
5737
6094
  GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
5738
- const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5739
- rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
6095
+ rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
5740
6096
  } else {
5741
- const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5742
- rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
6097
+ rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
5743
6098
  }
5744
6099
 
5745
6100
  (void) src1;
5746
6101
  (void) dst;
5747
- (void) src0_ddq_i;
5748
- (void) src1_ddf_i;
5749
- (void) i1;
6102
+ (void) src1_dd;
5750
6103
  }
5751
6104
 
5752
6105
  inline void ggml_cuda_op_alibi(
5753
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5754
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5755
- cudaStream_t & cudaStream_main){
6106
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6107
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5756
6108
 
5757
- GGML_ASSERT(src0_ddf_i != nullptr);
5758
- GGML_ASSERT(dst_ddf_i != nullptr);
6109
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6110
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5759
6111
 
5760
6112
  const int64_t ne00 = src0->ne[0];
5761
6113
  const int64_t ne01 = src0->ne[1];
5762
6114
  const int64_t ne02 = src0->ne[2];
5763
- const int64_t i01_diff = i01_high - i01_low;
6115
+ const int64_t nrows = ggml_nrows(src0);
5764
6116
 
5765
6117
  const int n_past = ((int32_t *) dst->op_params)[0];
5766
6118
  const int n_head = ((int32_t *) dst->op_params)[1];
@@ -5775,334 +6127,393 @@ inline void ggml_cuda_op_alibi(
5775
6127
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
5776
6128
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
5777
6129
 
5778
- // compute
5779
- alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
6130
+ alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
5780
6131
 
5781
6132
  (void) src1;
5782
- (void) src0_ddq_i;
5783
- (void) src1_ddf_i;
5784
- (void) i1;
6133
+ (void) src1_dd;
5785
6134
  }
5786
6135
 
5787
6136
  inline void ggml_cuda_op_diag_mask_inf(
5788
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5789
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5790
- cudaStream_t & cudaStream_main){
6137
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6138
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5791
6139
 
5792
- GGML_ASSERT(src0_ddf_i != nullptr);
5793
- GGML_ASSERT(dst_ddf_i != nullptr);
6140
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6141
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5794
6142
 
5795
6143
  const int64_t ne00 = src0->ne[0];
5796
6144
  const int64_t ne01 = src0->ne[1];
5797
- const int64_t i01_diff = i01_high - i01_low;
6145
+ const int nrows0 = ggml_nrows(src0);
5798
6146
 
5799
6147
  const int n_past = ((int32_t *) dst->op_params)[0];
5800
6148
 
5801
- // compute
5802
- diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
6149
+ diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
5803
6150
 
5804
6151
  (void) src1;
5805
6152
  (void) dst;
5806
- (void) src0_ddq_i;
5807
- (void) src1_ddf_i;
5808
- (void) i02;
5809
- (void) i1;
6153
+ (void) src1_dd;
5810
6154
  }
5811
6155
 
5812
6156
  inline void ggml_cuda_op_soft_max(
5813
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5814
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5815
- cudaStream_t & cudaStream_main){
6157
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6158
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5816
6159
 
5817
- GGML_ASSERT(src0_ddf_i != nullptr);
5818
- GGML_ASSERT(dst_ddf_i != nullptr);
6160
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6161
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5819
6162
 
5820
6163
  const int64_t ne00 = src0->ne[0];
5821
- const int64_t i01_diff = i01_high - i01_low;
6164
+ const int64_t nrows = ggml_nrows(src0);
5822
6165
 
5823
- // compute
5824
- soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
6166
+ soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
5825
6167
 
5826
6168
  (void) src1;
5827
6169
  (void) dst;
5828
- (void) src0_ddq_i;
5829
- (void) src1_ddf_i;
5830
- (void) i02;
5831
- (void) i1;
6170
+ (void) src1_dd;
5832
6171
  }
5833
6172
 
5834
6173
  inline void ggml_cuda_op_scale(
5835
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5836
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5837
- cudaStream_t & cudaStream_main){
6174
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6175
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5838
6176
 
5839
- GGML_ASSERT(src0_ddf_i != nullptr);
5840
- GGML_ASSERT(dst_ddf_i != nullptr);
6177
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6178
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
6179
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5841
6180
 
5842
6181
  const float scale = ((float *) src1->data)[0];
5843
6182
 
5844
- const int64_t ne00 = src0->ne[0];
5845
- const int64_t i01_diff = i01_high - i01_low;
5846
-
5847
- // compute
5848
- scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
6183
+ scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
5849
6184
  CUDA_CHECK(cudaGetLastError());
5850
6185
 
5851
6186
  (void) src1;
5852
6187
  (void) dst;
5853
- (void) src0_ddq_i;
5854
- (void) src1_ddf_i;
5855
- (void) i02;
5856
- (void) i1;
6188
+ (void) src1_dd;
6189
+ }
6190
+
6191
+ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
6192
+ const int64_t nrows0 = ggml_nrows(src0);
6193
+
6194
+ const bool use_src1 = src1 != nullptr;
6195
+ const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
6196
+
6197
+ GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6198
+ GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
6199
+
6200
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6201
+ struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
6202
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6203
+
6204
+ const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6205
+ const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
6206
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
6207
+
6208
+ const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
6209
+
6210
+ // dd = data device
6211
+ float * src0_ddf = nullptr;
6212
+ float * src1_ddf = nullptr;
6213
+ float * dst_ddf = nullptr;
6214
+
6215
+ // as = actual size
6216
+ size_t src0_asf = 0;
6217
+ size_t src1_asf = 0;
6218
+ size_t dst_asf = 0;
6219
+
6220
+ ggml_cuda_set_device(g_main_device);
6221
+ const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6222
+
6223
+ if (src0_on_device) {
6224
+ src0_ddf = (float *) src0_extra->data_device[g_main_device];
6225
+ } else {
6226
+ src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
6227
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
6228
+ }
6229
+
6230
+ if (use_src1 && !src1_stays_on_host) {
6231
+ if (src1_on_device) {
6232
+ src1_ddf = (float *) src1_extra->data_device[g_main_device];
6233
+ } else {
6234
+ src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
6235
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
6236
+ }
6237
+ }
6238
+ if (dst_on_device) {
6239
+ dst_ddf = (float *) dst_extra->data_device[g_main_device];
6240
+ } else {
6241
+ dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
6242
+ }
6243
+
6244
+ // do the computation
6245
+ op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
6246
+ CUDA_CHECK(cudaGetLastError());
6247
+
6248
+ // copy dst to host if necessary
6249
+ if (!dst_on_device) {
6250
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
6251
+ }
6252
+
6253
+ if (src0_asf > 0) {
6254
+ ggml_cuda_pool_free(src0_ddf, src0_asf);
6255
+ }
6256
+ if (src1_asf > 0) {
6257
+ ggml_cuda_pool_free(src1_ddf, src1_asf);
6258
+ }
6259
+ if (dst_asf > 0) {
6260
+ ggml_cuda_pool_free(dst_ddf, dst_asf);
6261
+ }
6262
+
6263
+ if (dst->backend == GGML_BACKEND_CPU) {
6264
+ CUDA_CHECK(cudaDeviceSynchronize());
6265
+ }
6266
+ }
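
A trimmed-down sketch of the staging path ggml_cuda_op_flatten takes when a tensor is not already resident on the GPU: copy the input to a temporary device buffer, run the op on the main stream, copy the result back. Plain cudaMalloc/cudaFree stand in for ggml_cuda_pool_malloc/ggml_cuda_pool_free, and scale_kernel is a made-up placeholder op:

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <vector>

    __global__ void scale_kernel(const float * x, float * y, const float s, const int n) {
        const int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n) {
            y[i] = s*x[i];
        }
    }

    int main() {
        const int n = 1024;
        std::vector<float> host_src(n, 2.0f);
        std::vector<float> host_dst(n);

        float * d_src = nullptr;
        float * d_dst = nullptr;
        cudaMalloc(&d_src, n*sizeof(float));
        cudaMalloc(&d_dst, n*sizeof(float));

        cudaStream_t stream;
        cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

        // stage the input, run the op, copy the result back - the same shape as the helper above
        cudaMemcpyAsync(d_src, host_src.data(), n*sizeof(float), cudaMemcpyHostToDevice, stream);
        scale_kernel<<<(n + 255)/256, 256, 0, stream>>>(d_src, d_dst, 0.5f, n);
        cudaMemcpyAsync(host_dst.data(), d_dst, n*sizeof(float), cudaMemcpyDeviceToHost, stream);
        cudaStreamSynchronize(stream);

        printf("host_dst[0] = %f\n", host_dst[0]); // 1.0

        cudaStreamDestroy(stream);
        cudaFree(d_src);
        cudaFree(d_dst);
        return 0;
    }
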
6267
+
6268
+ void ggml_cuda_set_peer_access(const int n_tokens) {
6269
+ static bool peer_access_enabled = false;
6270
+
6271
+ const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
6272
+
6273
+ if (peer_access_enabled == enable_peer_access) {
6274
+ return;
6275
+ }
6276
+
6277
+ #ifdef NDEBUG
6278
+ for (int id = 0; id < g_device_count; ++id) {
6279
+ CUDA_CHECK(ggml_cuda_set_device(id));
6280
+
6281
+ for (int id_other = 0; id_other < g_device_count; ++id_other) {
6282
+ if (id == id_other) {
6283
+ continue;
6284
+ }
6285
+ if (id != g_main_device && id_other != g_main_device) {
6286
+ continue;
6287
+ }
6288
+
6289
+ int can_access_peer;
6290
+ CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
6291
+ if (can_access_peer) {
6292
+ if (enable_peer_access) {
6293
+ CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
6294
+ } else {
6295
+ CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
6296
+ }
6297
+ }
6298
+ }
6299
+ }
6300
+ #endif // NDEBUG
6301
+
6302
+ peer_access_enabled = enable_peer_access;
5857
6303
  }
5858
6304
 
5859
- static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5860
- ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
6305
+ static void ggml_cuda_op_mul_mat(
6306
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
6307
+ const bool convert_src1_to_q8_1) {
6308
+
5861
6309
  const int64_t ne00 = src0->ne[0];
5862
6310
  const int64_t ne01 = src0->ne[1];
5863
6311
  const int64_t ne02 = src0->ne[2];
5864
6312
  const int64_t ne03 = src0->ne[3];
5865
6313
  const int64_t nrows0 = ggml_nrows(src0);
5866
6314
 
5867
- const bool use_src1 = src1 != nullptr;
5868
- const int64_t ne10 = use_src1 ? src1->ne[0] : 1;
5869
- const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
5870
- const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
5871
- const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
5872
- const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
6315
+ const int64_t ne10 = src1->ne[0];
6316
+ const int64_t ne11 = src1->ne[1];
6317
+ const int64_t ne12 = src1->ne[2];
6318
+ const int64_t ne13 = src1->ne[3];
6319
+ const int64_t nrows1 = ggml_nrows(src1);
5873
6320
 
5874
6321
  GGML_ASSERT(ne03 == ne13);
5875
6322
 
5876
6323
  const int64_t ne0 = dst->ne[0];
5877
6324
  const int64_t ne1 = dst->ne[1];
5878
6325
 
5879
- const int nb2 = dst->nb[2];
5880
- const int nb3 = dst->nb[3];
6326
+ const int nb2 = dst->nb[2];
6327
+ const int nb3 = dst->nb[3];
6328
+
6329
+ ggml_cuda_set_peer_access(ne11);
5881
6330
 
5882
6331
  GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
5883
- GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6332
+ GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
5884
6333
 
5885
- // strides for iteration over dims 3 and 2
5886
- const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
5887
- const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
5888
- const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
5889
- const int64_t src0_stride = ne00 * ne01 * stride_mod;
5890
- const int64_t src1_stride = ne10 * ne11 * stride_mod;
5891
- const int64_t dst_stride = ne0 * ne1 * stride_mod;
6334
+ GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
5892
6335
 
5893
- const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
5894
- const int64_t i03_max = flatten_rows ? 1 : ne03;
5895
- const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
5896
- const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
5897
- GGML_ASSERT(!(flatten_rows && ne02 < ne12));
6336
+ const int64_t i02_divisor = ne12 / ne02;
5898
6337
 
5899
6338
  const size_t src0_ts = ggml_type_size(src0->type);
5900
6339
  const size_t src0_bs = ggml_blck_size(src0->type);
6340
+ const size_t q8_1_ts = sizeof(block_q8_1);
6341
+ const size_t q8_1_bs = QK8_1;
5901
6342
 
5902
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
5903
- struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
5904
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6343
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6344
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6345
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
5905
6346
 
5906
6347
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
5907
6348
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
5908
- const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
5909
6349
 
5910
- const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
5911
- const bool src1_stays_on_host = use_src1 && (
5912
- dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
6350
+ const bool src1_is_contiguous = ggml_is_contiguous(src1);
6351
+ const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ?
6352
+ ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
5913
6353
 
5914
6354
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
6355
+ GGML_ASSERT(!(split && ne02 > 1));
6356
+ GGML_ASSERT(!(split && ne03 > 1));
5915
6357
  GGML_ASSERT(!(split && ne02 < ne12));
5916
6358
 
5917
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
5918
-
5919
6359
  // dd = data device
5920
- char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
5921
- float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
5922
- float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
5923
- float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
5924
-
5925
- // asq = actual size quantized, asf = actual size float
5926
- size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
5927
- size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
5928
- size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
5929
- size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
6360
+ char * src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
6361
+ float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
6362
+ char * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
6363
+ float * dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
5930
6364
 
5931
- // if multiple devices are used they need to wait for the main device
5932
- // here an event is recorded that signifies that the main device has finished calculating the input data
5933
- if (split && g_device_count > 1) {
5934
- CUDA_CHECK(cudaSetDevice(g_main_device));
5935
- CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
5936
- }
6365
+ // as = actual size
6366
+ size_t src0_as[GGML_CUDA_MAX_DEVICES] = {0};
6367
+ size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
6368
+ size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
6369
+ size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0};
5937
6370
 
5938
- for (int id = 0; id < g_device_count; ++id) {
5939
- if (!split && id != g_main_device) {
5940
- continue;
5941
- }
6371
+ int64_t row_low[GGML_CUDA_MAX_DEVICES];
6372
+ int64_t row_high[GGML_CUDA_MAX_DEVICES];
5942
6373
 
5943
- const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device;
5944
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
6374
+ for (int64_t id = 0; id < g_device_count; ++id) {
6375
+ // by default, use all rows
6376
+ row_low[id] = 0;
6377
+ row_high[id] = ne01;
5945
6378
 
5946
- int64_t row_low, row_high;
6379
+ // for multi GPU, get the row boundaries from tensor split
6380
+ // and round to mul_mat_q tile sizes
5947
6381
  if (split) {
5948
6382
  const int64_t rounding = get_row_rounding(src0->type);
5949
6383
 
5950
- row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
5951
- row_low -= row_low % rounding;
6384
+ if (id != 0) {
6385
+ row_low[id] = ne01*g_tensor_split[id];
6386
+ row_low[id] -= row_low[id] % rounding;
6387
+ }
5952
6388
 
5953
- if (id == g_device_count - 1) {
5954
- row_high = nrows0;
5955
- } else {
5956
- row_high = nrows0*g_tensor_split[id + 1];
5957
- row_high -= row_high % rounding;
6389
+ if (id != g_device_count - 1) {
6390
+ row_high[id] = ne01*g_tensor_split[id + 1];
6391
+ row_high[id] -= row_high[id] % rounding;
5958
6392
  }
5959
- } else {
5960
- row_low = 0;
5961
- row_high = nrows0*i02_divisor;
5962
6393
  }
5963
- if (row_low == row_high) {
6394
+ }
6395
+
6396
+ for (int64_t id = 0; id < g_device_count; ++id) {
6397
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
5964
6398
  continue;
5965
6399
  }
5966
6400
 
5967
- int64_t row_diff = row_high - row_low;
5968
-
5969
- cudaSetDevice(id);
5970
- cudaStream_t cudaStream_main = g_cudaStreams_main[id];
6401
+ const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
6402
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
5971
6403
 
5972
- // wait for main GPU data if necessary
5973
- if (split && id != g_main_device) {
5974
- CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
5975
- }
6404
+ ggml_cuda_set_device(id);
6405
+ const cudaStream_t stream = g_cudaStreams[id][0];
5976
6406
 
5977
6407
  if (src0_on_device && src0_is_contiguous) {
5978
- if (src0_is_f32) {
5979
- src0_ddf[id] = (float *) src0_extra->data_device[id];
5980
- } else {
5981
- src0_ddq[id] = (char *) src0_extra->data_device[id];
5982
- }
6408
+ src0_dd[id] = (char *) src0_extra->data_device[id];
5983
6409
  } else {
5984
- if (src0_is_f32) {
5985
- src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
5986
- } else {
5987
- src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
5988
- }
6410
+ const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
6411
+ src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
5989
6412
  }
5990
6413
 
5991
- if (src0_needs_f32 && !src0_is_f32) {
5992
- src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
6414
+ if (src1_on_device && src1_is_contiguous) {
6415
+ src1_ddf[id] = (float *) src1_extra->data_device[id];
6416
+ } else {
6417
+ src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
5993
6418
  }
5994
6419
 
5995
- if (use_src1 && !src1_stays_on_host) {
5996
- if (src1_on_device && src1_is_contiguous) {
5997
- src1_ddf[id] = (float *) src1_extra->data_device[id];
5998
- } else {
5999
- src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
6420
+ if (convert_src1_to_q8_1) {
6421
+ src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
6422
+
6423
+ if (split && src1_on_device && src1_is_contiguous) {
6424
+ quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
6425
+ CUDA_CHECK(cudaGetLastError());
6000
6426
  }
6001
6427
  }
6428
+
6002
6429
  if (dst_on_device) {
6003
- dst_ddf[id] = (float *) dst_extra->data_device[id];
6430
+ dst_dd[id] = (float *) dst_extra->data_device[id];
6004
6431
  } else {
6005
- size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float);
6006
- dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
6432
+ const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
6433
+ dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
6007
6434
  }
6435
+ }
6436
+
6437
+ // if multiple devices are used they need to wait for the main device
6438
+ // here an event is recorded that signals that the main device has finished calculating the input data
6439
+ if (split && g_device_count > 1) {
6440
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6441
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
6442
+ }
6008
6443
 
6009
- for (int64_t i03 = 0; i03 < i03_max; i03++) {
6010
- const int64_t i13 = i03 % ne13;
6011
- for (int64_t i02 = 0; i02 < i02_max; i02++) {
6012
- const int64_t i12 = i02 % ne12;
6444
+ const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
6445
+ for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
6446
+ const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
6447
+ const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
6013
6448
 
6014
- const int64_t i0 = i03*i02_max + i02;
6449
+ for (int64_t id = 0; id < g_device_count; ++id) {
6450
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
6451
+ continue;
6452
+ }
6015
6453
 
6016
- // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
6017
- const int64_t i0_offset_low = row_low/rows_per_iter;
6018
- const int64_t i0_offset_high = row_high/rows_per_iter;
6454
+ const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
6455
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
6456
+ const int64_t row_diff = row_high[id] - row_low[id];
6019
6457
 
6020
- int64_t i01_low = 0;
6021
- int64_t i01_high = rows_per_iter;
6022
- if (split) {
6023
- if (i0 < i0_offset_low || i0 > i0_offset_high) {
6024
- continue;
6025
- }
6026
- if (i0 == i0_offset_low) {
6027
- i01_low = row_low % rows_per_iter;
6028
- }
6029
- if (i0 == i0_offset_high) {
6030
- i01_high = row_high % rows_per_iter;
6031
- }
6032
- }
6458
+ ggml_cuda_set_device(id);
6459
+ const cudaStream_t stream = g_cudaStreams[id][is];
6460
+
6461
+ // wait for main GPU data if necessary
6462
+ if (split && (id != g_main_device || is != 0)) {
6463
+ CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
6464
+ }
6033
6465
 
6034
- // There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or optimizing out local variables.
6035
- // Removing the first assert or changing the order of the arguments causes the second assert to fail.
6036
- // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
6037
- // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
6038
- GGML_ASSERT(i01_low == 0 || g_device_count > 1);
6039
- GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
6466
+ for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
6467
+ const int64_t i03 = i0 / ne12;
6468
+ const int64_t i02 = i0 % ne12;
6040
6469
 
6041
- const int64_t i01_diff = i01_high - i01_low;
6042
- if (i01_diff == 0) {
6043
- continue;
6044
- }
6045
- const int64_t i11 = i13*ne12 + i12;
6470
+ const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
6046
6471
 
6047
6472
  // for split tensors the data begins at i0 == i0_offset_low
6048
- char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
6049
- float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
6050
- float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
6051
- float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
6052
-
6053
- // for split tensors the data pointer needs to be rounded down
6054
- // to the bin edge for i03, i02 bins beyond the first
6055
- if (i0 - i0_offset_low > 0) {
6056
- GGML_ASSERT(!flatten_rows);
6057
- src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
6058
- src0_ddf_i -= (row_low % ne01)*ne00;
6059
- dst_ddf_i -= (row_low % ne0)*ne1;
6060
- }
6473
+ char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
6474
+ float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
6475
+ char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
6476
+ float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
6061
6477
 
6062
6478
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
6063
6479
  // in that case an offset on dst_ddf_i is needed
6064
6480
  if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
6065
- dst_ddf_i += i01_low; // offset is 0 if no tensor split
6481
+ dst_dd_i += row_low[id]; // offset is 0 if no tensor split
6066
6482
  }
6067
6483
 
6068
6484
  // copy src0, src1 to device if necessary
6069
- if (use_src1 && !src1_stays_on_host) {
6070
- if (src1->backend == GGML_BACKEND_CPU) {
6071
- GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
6072
- int64_t nrows1 = flatten_rows ? nrows0 : ne11;
6073
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
6074
- } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
6075
- if (id != g_main_device) {
6076
- GGML_ASSERT(!flatten_rows);
6485
+ if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
6486
+ if (id != g_main_device) {
6487
+ if (convert_src1_to_q8_1) {
6488
+ char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
6489
+ CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs,
6490
+ cudaMemcpyDeviceToDevice, stream));
6491
+ } else {
6077
6492
  float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
6078
- src1_ddf_i_source += i11*src1_stride;
6079
- CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
6080
- cudaMemcpyDeviceToDevice, cudaStream_main));
6493
+ src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
6494
+ CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float),
6495
+ cudaMemcpyDeviceToDevice, stream));
6081
6496
  }
6082
- } else if (src1_on_device && !src1_is_contiguous) {
6083
- GGML_ASSERT(!split);
6084
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
6085
- } else {
6086
- GGML_ASSERT(false);
6087
6497
  }
6498
+ } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
6499
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
6500
+ src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
6501
+ } else {
6502
+ GGML_ASSERT(false);
6088
6503
  }
6089
6504
 
6090
- if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
6091
- if (src0_is_f32) {
6092
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
6093
- } else {
6094
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
6095
- }
6505
+ if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
6506
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
6507
+ CUDA_CHECK(cudaGetLastError());
6096
6508
  }
6097
6509
 
6098
- // convert src0 to f32 if it is necessary for the ggml_cuda_op
6099
- if (src0_needs_f32 && !src0_is_f32) {
6100
- to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
6101
- CUDA_CHECK(cudaGetLastError());
6510
+ if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
6511
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
6102
6512
  }
6103
6513
 
6104
6514
  // do the computation
6105
- op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
6515
+ op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
6516
+ row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
6106
6517
  CUDA_CHECK(cudaGetLastError());
6107
6518
 
6108
6519
  // copy dst to host or other device if necessary
@@ -6124,95 +6535,86 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
  // Instead they need to be copied to the correct slice in ne0 = dst row index.
  // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
- float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
- CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
- i01_diff*sizeof(float), ne1, kind, cudaStream_main));
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+ GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
+ dhf_dst_i += src1_col_0*ne0 + row_low[id];
+ CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float),
+ row_diff*sizeof(float), src1_ncols, kind, stream));
  } else {
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
- CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
+ GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
+ dhf_dst_i += src1_col_0*ne0;
+ CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
  }
  }

- // signify to main device that other device is done
- if (split && g_device_count > 1 && id != g_main_device) {
- CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
+ // add event for the main device to wait on until other device is done
+ if (split && (id != g_main_device || is != 0)) {
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
  }
  }
  }

- // wait until each device is finished, then free their buffers
- for (int id = 0; id < g_device_count; ++id) {
- if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
- continue;
- }
-
- CUDA_CHECK(cudaSetDevice(id));
+ for (int64_t id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(ggml_cuda_set_device(id));

- if (src0_asq[id] > 0) {
- ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
- }
- if (src0_asf[id] > 0) {
- ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
+ // free buffers again when done
+ if (src0_as[id] > 0) {
+ ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
  }
  if (src1_asf[id] > 0) {
  ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
  }
- if (dst_asf[id] > 0) {
- ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
+ if (src1_asq[id] > 0) {
+ ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
+ }
+ if (dst_as[id] > 0) {
+ ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
  }
  }

  // main device waits for all other devices to be finished
  if (split && g_device_count > 1) {
- CUDA_CHECK(cudaSetDevice(g_main_device));
- for (int id = 0; id < g_device_count; ++id) {
- if (id != g_main_device && src0_extra->events[id]) {
- CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
+ int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
+ is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
+
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ for (int64_t id = 0; id < g_device_count; ++id) {
+ for (int64_t is = 0; is < is_max; ++is) {
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
  }
  }
  }

  if (dst->backend == GGML_BACKEND_CPU) {
- CUDA_CHECK(cudaSetDevice(g_main_device));
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  CUDA_CHECK(cudaDeviceSynchronize());
  }
  }

  void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
- // Due to flatten_rows == true this does in practice not make a difference however.
- // Better solution would be nice but right now that would require disproportionate changes.
- GGML_ASSERT(
- (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
- src1->type == GGML_TYPE_F32 &&
- (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
  }

  void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
  }

  void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
  }

  void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
  }

  void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
  }

  void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
  }

  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
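The synchronization above follows the standard CUDA event pattern: every worker stream records an event once its share of the work is enqueued, and the main device's stream waits on all recorded events before the result is consumed. A standalone sketch of that pattern, with illustrative names and a hard-coded upper bound on the device count:

// Sketch: main stream waits on events recorded by worker streams on other devices.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CHECK(err) do { cudaError_t e_ = (err); if (e_ != cudaSuccess) { \
    fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(e_), __FILE__, __LINE__); \
    exit(1); } } while (0)

int main() {
    int n_devices = 0;
    CHECK(cudaGetDeviceCount(&n_devices));
    if (n_devices == 0) { printf("no CUDA devices\n"); return 0; }
    if (n_devices > 16) { n_devices = 16; }

    const int main_device = 0;
    cudaStream_t streams[16];
    cudaEvent_t  events[16];

    // one worker stream + one event per device
    for (int id = 0; id < n_devices; ++id) {
        CHECK(cudaSetDevice(id));
        CHECK(cudaStreamCreateWithFlags(&streams[id], cudaStreamNonBlocking));
        // timing is not needed, so disable it to make the event cheaper
        CHECK(cudaEventCreateWithFlags(&events[id], cudaEventDisableTiming));
    }

    // ... enqueue per-device work on streams[id] here ...

    // each device records "my work up to this point is done"
    for (int id = 0; id < n_devices; ++id) {
        CHECK(cudaSetDevice(id));
        CHECK(cudaEventRecord(events[id], streams[id]));
    }

    // the main device's stream waits on all recorded events; the wait happens
    // on the GPU, so the host thread only blocks at the final synchronize
    CHECK(cudaSetDevice(main_device));
    for (int id = 0; id < n_devices; ++id) {
        CHECK(cudaStreamWaitEvent(streams[main_device], events[id], 0));
    }
    CHECK(cudaStreamSynchronize(streams[main_device]));

    for (int id = 0; id < n_devices; ++id) {
        CHECK(cudaSetDevice(id));
        CHECK(cudaEventDestroy(events[id]));
        CHECK(cudaStreamDestroy(streams[id]));
    }
    return 0;
}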
@@ -6246,8 +6648,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr

  const int64_t ne12 = src1->ne[2];

- CUDA_CHECK(cudaSetDevice(g_main_device));
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -6258,7 +6660,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

- ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
+ ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
  }

  void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -6277,8 +6679,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
  const int64_t nb01 = src0->nb[1];
  const int64_t nb02 = src0->nb[2];

- CUDA_CHECK(cudaSetDevice(g_main_device));
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -6289,38 +6691,49 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

- const int row_stride_x = nb01 / sizeof(half);
- const int channel_stride_x = nb02 / sizeof(half);
+ const int64_t row_stride_x = nb01 / sizeof(half);
+ const int64_t channel_stride_x = nb02 / sizeof(half);

- ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
+ ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
  }

  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
  src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;

+ int64_t min_compute_capability = INT_MAX;
+ for (int64_t id = 0; id < g_device_count; ++id) {
+ if (min_compute_capability > g_compute_capabilities[id]
+ && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+ min_compute_capability = g_compute_capabilities[id];
+ }
+ }
+
  if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
  } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
  }else if (src0->type == GGML_TYPE_F32) {
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
- } else {
- int min_compute_capability = INT_MAX;
- for (int id = 0; id < g_device_count; ++id) {
- if (min_compute_capability > g_compute_capabilities[id]
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
- min_compute_capability = g_compute_capabilities[id];
- }
- }

+ #ifdef GGML_CUDA_FORCE_DMMV
+ const bool use_mul_mat_vec_q = false;
+ #else
+ const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+ #endif // GGML_CUDA_FORCE_DMMV
+
+ if (use_mul_mat_vec_q) {
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+ } else {
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+ }
+ } else {
  if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
  } else {
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
  }
  }
  } else {
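The dispatch above keys off the lowest compute capability among the participating devices, since the __dp4a-based kernels need compute capability 6.1 (MIN_CC_DP4A = 610) on every device involved. A small sketch of how such a minimum can be queried with the plain CUDA runtime API; the decision logic here is illustrative, not the library's exact rule:

// Sketch: find the minimum compute capability across all visible devices
// and decide whether a __dp4a-based (byte-wise dot product) path is usable.
#include <cuda_runtime.h>
#include <climits>
#include <cstdio>

int main() {
    int n_devices = 0;
    if (cudaGetDeviceCount(&n_devices) != cudaSuccess || n_devices == 0) {
        printf("no CUDA devices\n");
        return 0;
    }

    int min_cc = INT_MAX;
    for (int id = 0; id < n_devices; ++id) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, id);
        const int cc = 100 * prop.major + 10 * prop.minor;  // e.g. 610 for sm_61
        if (cc < min_cc) {
            min_cc = cc;
        }
    }

    const bool can_use_dp4a = min_cc >= 610;  // __dp4a requires compute capability 6.1+
    printf("min compute capability: %d, dp4a path: %s\n", min_cc, can_use_dp4a ? "yes" : "no");
    return 0;
}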
@@ -6329,8 +6742,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
  }

  void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
  }

  void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6359,8 +6771,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
  const int64_t nb11 = src1->nb[1];
  const int64_t nb12 = src1->nb[2];

- CUDA_CHECK(cudaSetDevice(g_main_device));
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

  const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
  const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -6370,10 +6782,10 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens

  if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
  ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
- ne10, ne11, nb10, nb11, nb12, cudaStream_main);
+ ne10, ne11, nb10, nb11, nb12, main_stream);
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
  ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
- ne10, ne11, nb10, nb11, nb12, cudaStream_main);
+ ne10, ne11, nb10, nb11, nb12, main_stream);
  } else {
  GGML_ASSERT(false);
  }
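ggml_cpy_f32_f32_cuda and ggml_cpy_f32_f16_cuda launch type-converting copy kernels defined elsewhere in this file. As a rough illustration of the idea, and not the library's actual kernel, a contiguous-only float-to-half copy looks like this:

// Sketch: contiguous float -> half copy, one element per thread.
#include <cuda_runtime.h>
#include <cuda_fp16.h>

__global__ void cpy_f32_f16(const float * __restrict__ x, half * __restrict__ y, const int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) {
        return;
    }
    y[i] = __float2half(x[i]);
}

static void cpy_f32_f16_contiguous(const float * x, half * y, const int n, cudaStream_t stream) {
    const int block_size = 256;
    const int num_blocks = (n + block_size - 1) / block_size;
    cpy_f32_f16<<<num_blocks, block_size, 0, stream>>>(x, y, n);
}

int main() {
    const int n = 1024;
    float * x = nullptr;
    half  * y = nullptr;
    cudaMalloc((void **) &x, n * sizeof(float));
    cudaMalloc((void **) &y, n * sizeof(half));
    cudaMemset(x, 0, n * sizeof(float));
    cpy_f32_f16_contiguous(x, y, n, nullptr);  // nullptr = default stream
    cudaDeviceSynchronize();
    cudaFree(x);
    cudaFree(y);
    return 0;
}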
@@ -6387,28 +6799,20 @@ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
  }

  void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
  }

  void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
  }

  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
  GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
-
- const int mode = ((int32_t *) dst->op_params)[2];
- const bool is_glm = mode & 4;
-
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
  }

  void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
  }

  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
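All of these wrappers now forward to ggml_cuda_op_flatten, passing the per-operation implementation as an argument, which presumably centralizes the buffer and stream handling each wrapper previously repeated. A tiny illustrative sketch of that dispatch shape, with simplified, hypothetical types and helpers:

// Illustrative only: every wrapper forwards to one helper that takes the
// per-operation implementation as a function pointer.
#include <cstdio>

struct tensor;  // stand-in for ggml_tensor, used only through pointers here

typedef void (*cuda_op_t)(const tensor * src0, const tensor * src1, tensor * dst);

static void op_add (const tensor *, const tensor *, tensor *) { printf("add\n");  }
static void op_silu(const tensor *, const tensor *, tensor *) { printf("silu\n"); }

// stand-in for ggml_cuda_op_flatten: shared setup/teardown around the op
static void op_flatten(const tensor * src0, const tensor * src1, tensor * dst, cuda_op_t op) {
    // ... pick device and stream, stage inputs on the GPU ... (omitted)
    op(src0, src1, dst);
    // ... copy the result back if needed, free temporaries ... (omitted)
}

int main() {
    op_flatten(nullptr, nullptr, nullptr, op_add);
    op_flatten(nullptr, nullptr, nullptr, op_silu);
    return 0;
}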
@@ -6418,7 +6822,7 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
  }

  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
- int nrows = ggml_nrows(tensor);
+ const int64_t nrows = ggml_nrows(tensor);

  const int64_t ne0 = tensor->ne[0];

@@ -6428,14 +6832,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
  memset(extra, 0, sizeof(*extra));

- for (int id = 0; id < g_device_count; ++id) {
+ for (int64_t id = 0; id < g_device_count; ++id) {
  if (backend == GGML_BACKEND_GPU && id != g_main_device) {
  continue;
  }

- cudaSetDevice(id);
+ ggml_cuda_set_device(id);

- int row_low, row_high;
+ int64_t row_low, row_high;
  if (backend == GGML_BACKEND_GPU) {
  row_low = 0;
  row_high = nrows;
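For GGML_BACKEND_GPU_SPLIT the rows are divided between devices according to the g_tensor_split prefix fractions, the same array consulted by the mul_mat dispatch earlier. A hedged sketch of such a proportional split; the real code additionally aligns the boundaries, which this illustration omits:

// Sketch: split [0, nrows) into per-device ranges from prefix fractions,
// e.g. split[] = {0.0f, 0.5f} gives device 0 the first half, device 1 the rest.
#include <cstdint>
#include <cstdio>

static void rows_for_device(int64_t nrows, const float * split, int n_devices, int id,
                            int64_t * row_low, int64_t * row_high) {
    *row_low  = (int64_t) (nrows * split[id]);
    *row_high = id + 1 < n_devices ? (int64_t) (nrows * split[id + 1]) : nrows;
}

int main() {
    const float split[2] = {0.0f, 0.5f};  // illustrative proportions
    int64_t lo, hi;
    for (int id = 0; id < 2; ++id) {
        rows_for_device(1000, split, 2, id, &lo, &hi);
        printf("device %d: rows [%lld, %lld)\n", id, (long long) lo, (long long) hi);
    }
    return 0;
}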
@@ -6485,7 +6889,9 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  extra->data_device[id] = buf;

  if (backend == GGML_BACKEND_GPU_SPLIT) {
- CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+ CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
+ }
  }
  }

@@ -6499,15 +6905,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {

  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

- for (int id = 0; id < g_device_count; ++id) {
+ for (int64_t id = 0; id < g_device_count; ++id) {
  if (extra->data_device[id] != nullptr) {
- CUDA_CHECK(cudaSetDevice(id));
+ CUDA_CHECK(ggml_cuda_set_device(id));
  CUDA_CHECK(cudaFree(extra->data_device[id]));
  }

- if (extra->events[id] != nullptr) {
- CUDA_CHECK(cudaSetDevice(id));
- CUDA_CHECK(cudaEventDestroy(extra->events[id]));
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+ if (extra->events[id][is] != nullptr) {
+ CUDA_CHECK(ggml_cuda_set_device(id));
+ CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
+ }
  }
  }

@@ -6559,7 +6967,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  force_inplace;
  const size_t size = ggml_nbytes(tensor);

- CUDA_CHECK(cudaSetDevice(g_main_device));
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
@@ -6608,6 +7016,7 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
  return;
  }
  if (g_scratch_buffer == nullptr) {
+ ggml_cuda_set_device(g_main_device);
  CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
  }

@@ -6647,7 +7056,7 @@ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
  ggml_cuda_assign_buffers_impl(tensor, false, true, false);
  }

- void ggml_cuda_set_main_device(int main_device) {
+ void ggml_cuda_set_main_device(const int main_device) {
  if (main_device >= g_device_count) {
  fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
  main_device, g_device_count, g_main_device);
@@ -6661,11 +7070,11 @@ void ggml_cuda_set_main_device(int main_device) {
  }
  }

- void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
+ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
  g_mul_mat_q = mul_mat_q;
  }

- void ggml_cuda_set_scratch_size(size_t scratch_size) {
+ void ggml_cuda_set_scratch_size(const size_t scratch_size) {
  g_scratch_size = scratch_size;
  }