llama_cpp 0.5.1 → 0.5.3

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
@@ -13,7 +13,7 @@
  #ifdef __HIP_PLATFORM_AMD__
  // for rocblas_initialize()
  #include "rocblas/rocblas.h"
- #endif
+ #endif // __HIP_PLATFORM_AMD__
  #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
  #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
  #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
@@ -31,6 +31,9 @@
  #define cublasSetStream hipblasSetStream
  #define cublasSgemm hipblasSgemm
  #define cublasStatus_t hipblasStatus_t
+ #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+ #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+ #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
  #define cudaDeviceProp hipDeviceProp_t
  #define cudaDeviceSynchronize hipDeviceSynchronize
  #define cudaError_t hipError_t
@@ -61,26 +64,36 @@
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
  #define cudaStreamNonBlocking hipStreamNonBlocking
  #define cudaStreamSynchronize hipStreamSynchronize
- #define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+ #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
  #define cudaStream_t hipStream_t
  #define cudaSuccess hipSuccess
  #else
  #include <cuda_runtime.h>
  #include <cublas_v2.h>
  #include <cuda_fp16.h>
- #endif
+ #endif // defined(GGML_USE_HIPBLAS)

  #include "ggml-cuda.h"
  #include "ggml.h"

- #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
- #ifndef CC_TURING
- #define CC_TURING 700
- #endif
+ #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+ #define CC_TURING 700
+ #define CC_OFFSET_AMD 1000000
+ #define CC_RDNA2 CC_OFFSET_AMD + 1030

  #if defined(GGML_USE_HIPBLAS)
  #define __CUDA_ARCH__ 1300

+ #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+ defined(__gfx1150__) || defined(__gfx1151__)
+ #define RDNA3
+ #endif
+
+ #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+ defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+ #define RDNA2
+ #endif
+
  #ifndef __has_builtin
  #define __has_builtin(x) 0
  #endif
@@ -132,7 +145,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
  #endif
  return c;
  }
- #endif
+ #endif // defined(GGML_USE_HIPBLAS)

  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -144,8 +157,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  do { \
  cudaError_t err_ = (err); \
  if (err_ != cudaSuccess) { \
- fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
+ int id; \
+ cudaGetDevice(&id); \
+ fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
  cudaGetErrorString(err_)); \
+ fprintf(stderr, "current device: %d\n", id); \
  exit(1); \
  } \
  } while (0)
@@ -155,8 +171,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  do { \
  cublasStatus_t err_ = (err); \
  if (err_ != CUBLAS_STATUS_SUCCESS) { \
+ int id; \
+ cudaGetDevice(&id); \
  fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
  err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
+ fprintf(stderr, "current device: %d\n", id); \
  exit(1); \
  } \
  } while (0)
@@ -165,12 +184,21 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  do { \
  cublasStatus_t err_ = (err); \
  if (err_ != CUBLAS_STATUS_SUCCESS) { \
+ int id; \
+ cudaGetDevice(&id); \
  fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
+ fprintf(stderr, "current device: %d\n", id); \
  exit(1); \
  } \
  } while (0)
  #endif // CUDART_VERSION >= 11

+ #if CUDART_VERSION >= 11100
+ #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+ #else
+ #define GGML_CUDA_ASSUME(x)
+ #endif // CUDART_VERSION >= 11100
+
  #ifdef GGML_CUDA_F16
  typedef half dfloat; // dequantize float
  typedef half2 dfloat2;
@@ -212,10 +240,13 @@ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__
  typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
- typedef void (*ggml_cuda_op_t)(
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
- float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
- cudaStream_t & cudaStream_main);
+ typedef void (*ggml_cuda_op_mul_mat_t)(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+ const int64_t src1_padded_row_size, const cudaStream_t & stream);
+ typedef void (*ggml_cuda_op_flatten_t)(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream);

  // QK = number of values after dequantization
  // QR = QK / number of values before dequantization
@@ -396,11 +427,33 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
  #endif

+ #ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
+ #define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
+ #endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
+
+ #define MUL_MAT_SRC1_COL_STRIDE 128
+
+ #define MAX_STREAMS 8
+ static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+
  struct ggml_tensor_extra_gpu {
  void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
- cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+ cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
  };

+ // this is faster on Windows
+ // probably because the Windows CUDA libraries forget to make this check before invoking the drivers
+ inline cudaError_t ggml_cuda_set_device(const int device) {
+ int current_device;
+ CUDA_CHECK(cudaGetDevice(&current_device));
+
+ if (device == current_device) {
+ return cudaSuccess;
+ }
+
+ return cudaSetDevice(device);
+ }
+
  static int g_device_count = -1;
  static int g_main_device = 0;
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
@@ -413,8 +466,6 @@ static size_t g_scratch_offset = 0;

  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

- static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -2107,10 +2158,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2107
2158
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2108
2159
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2109
2160
 
2110
- __builtin_assume(i_offset >= 0);
2111
- __builtin_assume(i_offset < nwarps);
2112
- __builtin_assume(k >= 0);
2113
- __builtin_assume(k < WARP_SIZE);
2161
+ GGML_CUDA_ASSUME(i_offset >= 0);
2162
+ GGML_CUDA_ASSUME(i_offset < nwarps);
2163
+ GGML_CUDA_ASSUME(k >= 0);
2164
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
2114
2165
 
2115
2166
  const int kbx = k / QI4_0;
2116
2167
  const int kqsx = k % QI4_0;
@@ -2201,10 +2252,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2201
2252
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2202
2253
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2203
2254
 
2204
- __builtin_assume(i_offset >= 0);
2205
- __builtin_assume(i_offset < nwarps);
2206
- __builtin_assume(k >= 0);
2207
- __builtin_assume(k < WARP_SIZE);
2255
+ GGML_CUDA_ASSUME(i_offset >= 0);
2256
+ GGML_CUDA_ASSUME(i_offset < nwarps);
2257
+ GGML_CUDA_ASSUME(k >= 0);
2258
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
2208
2259
 
2209
2260
  const int kbx = k / QI4_1;
2210
2261
  const int kqsx = k % QI4_1;
@@ -2293,10 +2344,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2293
2344
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2294
2345
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2295
2346
 
2296
- __builtin_assume(i_offset >= 0);
2297
- __builtin_assume(i_offset < nwarps);
2298
- __builtin_assume(k >= 0);
2299
- __builtin_assume(k < WARP_SIZE);
2347
+ GGML_CUDA_ASSUME(i_offset >= 0);
2348
+ GGML_CUDA_ASSUME(i_offset < nwarps);
2349
+ GGML_CUDA_ASSUME(k >= 0);
2350
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
2300
2351
 
2301
2352
  const int kbx = k / QI5_0;
2302
2353
  const int kqsx = k % QI5_0;
@@ -2407,10 +2458,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2407
2458
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2408
2459
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2409
2460
 
2410
- __builtin_assume(i_offset >= 0);
2411
- __builtin_assume(i_offset < nwarps);
2412
- __builtin_assume(k >= 0);
2413
- __builtin_assume(k < WARP_SIZE);
2461
+ GGML_CUDA_ASSUME(i_offset >= 0);
2462
+ GGML_CUDA_ASSUME(i_offset < nwarps);
2463
+ GGML_CUDA_ASSUME(k >= 0);
2464
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
2414
2465
 
2415
2466
  const int kbx = k / QI5_1;
2416
2467
  const int kqsx = k % QI5_1;
@@ -2513,10 +2564,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2513
2564
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2514
2565
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2515
2566
 
2516
- __builtin_assume(i_offset >= 0);
2517
- __builtin_assume(i_offset < nwarps);
2518
- __builtin_assume(k >= 0);
2519
- __builtin_assume(k < WARP_SIZE);
2567
+ GGML_CUDA_ASSUME(i_offset >= 0);
2568
+ GGML_CUDA_ASSUME(i_offset < nwarps);
2569
+ GGML_CUDA_ASSUME(k >= 0);
2570
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
2520
2571
 
2521
2572
  const int kbx = k / QI8_0;
2522
2573
  const int kqsx = k % QI8_0;
@@ -2604,10 +2655,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2604
2655
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2605
2656
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2606
2657
 
2607
- __builtin_assume(i_offset >= 0);
2608
- __builtin_assume(i_offset < nwarps);
2609
- __builtin_assume(k >= 0);
2610
- __builtin_assume(k < WARP_SIZE);
2658
+ GGML_CUDA_ASSUME(i_offset >= 0);
2659
+ GGML_CUDA_ASSUME(i_offset < nwarps);
2660
+ GGML_CUDA_ASSUME(k >= 0);
2661
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
2611
2662
 
2612
2663
  const int kbx = k / QI2_K;
2613
2664
  const int kqsx = k % QI2_K;
@@ -2725,10 +2776,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2725
2776
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2726
2777
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2727
2778
 
2728
- __builtin_assume(i_offset >= 0);
2729
- __builtin_assume(i_offset < nwarps);
2730
- __builtin_assume(k >= 0);
2731
- __builtin_assume(k < WARP_SIZE);
2779
+ GGML_CUDA_ASSUME(i_offset >= 0);
2780
+ GGML_CUDA_ASSUME(i_offset < nwarps);
2781
+ GGML_CUDA_ASSUME(k >= 0);
2782
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
2732
2783
 
2733
2784
  const int kbx = k / QI3_K;
2734
2785
  const int kqsx = k % QI3_K;
@@ -2943,10 +2994,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
2943
2994
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
2944
2995
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
2945
2996
 
2946
- __builtin_assume(i_offset >= 0);
2947
- __builtin_assume(i_offset < nwarps);
2948
- __builtin_assume(k >= 0);
2949
- __builtin_assume(k < WARP_SIZE);
2997
+ GGML_CUDA_ASSUME(i_offset >= 0);
2998
+ GGML_CUDA_ASSUME(i_offset < nwarps);
2999
+ GGML_CUDA_ASSUME(k >= 0);
3000
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
2950
3001
 
2951
3002
  const int kbx = k / QI4_K; // == 0 if QK_K == 256
2952
3003
  const int kqsx = k % QI4_K; // == k if QK_K == 256
@@ -3124,10 +3175,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
3124
3175
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
3125
3176
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
3126
3177
 
3127
- __builtin_assume(i_offset >= 0);
3128
- __builtin_assume(i_offset < nwarps);
3129
- __builtin_assume(k >= 0);
3130
- __builtin_assume(k < WARP_SIZE);
3178
+ GGML_CUDA_ASSUME(i_offset >= 0);
3179
+ GGML_CUDA_ASSUME(i_offset < nwarps);
3180
+ GGML_CUDA_ASSUME(k >= 0);
3181
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
3131
3182
 
3132
3183
  const int kbx = k / QI5_K; // == 0 if QK_K == 256
3133
3184
  const int kqsx = k % QI5_K; // == k if QK_K == 256
@@ -3253,10 +3304,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
3253
3304
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
3254
3305
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
3255
3306
 
3256
- __builtin_assume(i_offset >= 0);
3257
- __builtin_assume(i_offset < nwarps);
3258
- __builtin_assume(k >= 0);
3259
- __builtin_assume(k < WARP_SIZE);
3307
+ GGML_CUDA_ASSUME(i_offset >= 0);
3308
+ GGML_CUDA_ASSUME(i_offset < nwarps);
3309
+ GGML_CUDA_ASSUME(k >= 0);
3310
+ GGML_CUDA_ASSUME(k < WARP_SIZE);
3260
3311
 
3261
3312
  const int kbx = k / QI6_K; // == 0 if QK_K == 256
3262
3313
  const int kqsx = k % QI6_K; // == k if QK_K == 256
@@ -3444,6 +3495,12 @@ static __device__ __forceinline__ void mul_mat_q(
3444
3495
  }
3445
3496
  }
3446
3497
 
3498
+ #define MMQ_X_Q4_0_RDNA2 64
3499
+ #define MMQ_Y_Q4_0_RDNA2 128
3500
+ #define NWARPS_Q4_0_RDNA2 8
3501
+ #define MMQ_X_Q4_0_RDNA1 64
3502
+ #define MMQ_Y_Q4_0_RDNA1 64
3503
+ #define NWARPS_Q4_0_RDNA1 8
3447
3504
  #define MMQ_X_Q4_0_AMPERE 64
3448
3505
  #define MMQ_Y_Q4_0_AMPERE 128
3449
3506
  #define NWARPS_Q4_0_AMPERE 4
@@ -3451,11 +3508,32 @@ static __device__ __forceinline__ void mul_mat_q(
3451
3508
  #define MMQ_Y_Q4_0_PASCAL 64
3452
3509
  #define NWARPS_Q4_0_PASCAL 8
3453
3510
 
3454
- template <bool need_check> static __global__ void mul_mat_q4_0(
3511
+ template <bool need_check> static __global__ void
3512
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3513
+ #if defined(RDNA3) || defined(RDNA2)
3514
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
3515
+ #endif // defined(RDNA3) || defined(RDNA2)
3516
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3517
+ mul_mat_q4_0(
3455
3518
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3456
3519
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3457
3520
 
3458
- #if __CUDA_ARCH__ >= CC_TURING
3521
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3522
+ #if defined(RDNA3) || defined(RDNA2)
3523
+ const int mmq_x = MMQ_X_Q4_0_RDNA2;
3524
+ const int mmq_y = MMQ_Y_Q4_0_RDNA2;
3525
+ const int nwarps = NWARPS_Q4_0_RDNA2;
3526
+ #else
3527
+ const int mmq_x = MMQ_X_Q4_0_RDNA1;
3528
+ const int mmq_y = MMQ_Y_Q4_0_RDNA1;
3529
+ const int nwarps = NWARPS_Q4_0_RDNA1;
3530
+ #endif // defined(RDNA3) || defined(RDNA2)
3531
+
3532
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3533
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3534
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3535
+
3536
+ #elif __CUDA_ARCH__ >= CC_TURING
3459
3537
  const int mmq_x = MMQ_X_Q4_0_AMPERE;
3460
3538
  const int mmq_y = MMQ_Y_Q4_0_AMPERE;
3461
3539
  const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3478,6 +3556,12 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
3478
3556
  #endif // __CUDA_ARCH__ >= CC_TURING
3479
3557
  }
3480
3558
 
3559
+ #define MMQ_X_Q4_1_RDNA2 64
3560
+ #define MMQ_Y_Q4_1_RDNA2 128
3561
+ #define NWARPS_Q4_1_RDNA2 8
3562
+ #define MMQ_X_Q4_1_RDNA1 64
3563
+ #define MMQ_Y_Q4_1_RDNA1 64
3564
+ #define NWARPS_Q4_1_RDNA1 8
3481
3565
  #define MMQ_X_Q4_1_AMPERE 64
3482
3566
  #define MMQ_Y_Q4_1_AMPERE 128
3483
3567
  #define NWARPS_Q4_1_AMPERE 4
@@ -3486,14 +3570,33 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
3486
3570
  #define NWARPS_Q4_1_PASCAL 8
3487
3571
 
3488
3572
  template <bool need_check> static __global__ void
3489
- #if __CUDA_ARCH__ < CC_TURING
3573
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3574
+ #if defined(RDNA3) || defined(RDNA2)
3575
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
3576
+ #endif // defined(RDNA3) || defined(RDNA2)
3577
+ #elif __CUDA_ARCH__ < CC_TURING
3490
3578
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
3491
3579
  #endif // __CUDA_ARCH__ < CC_TURING
3492
3580
  mul_mat_q4_1(
3493
3581
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3494
3582
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3495
3583
 
3496
- #if __CUDA_ARCH__ >= CC_TURING
3584
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3585
+ #if defined(RDNA3) || defined(RDNA2)
3586
+ const int mmq_x = MMQ_X_Q4_1_RDNA2;
3587
+ const int mmq_y = MMQ_Y_Q4_1_RDNA2;
3588
+ const int nwarps = NWARPS_Q4_1_RDNA2;
3589
+ #else
3590
+ const int mmq_x = MMQ_X_Q4_1_RDNA1;
3591
+ const int mmq_y = MMQ_Y_Q4_1_RDNA1;
3592
+ const int nwarps = NWARPS_Q4_1_RDNA1;
3593
+ #endif // defined(RDNA3) || defined(RDNA2)
3594
+
3595
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
3596
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3597
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3598
+
3599
+ #elif __CUDA_ARCH__ >= CC_TURING
3497
3600
  const int mmq_x = MMQ_X_Q4_1_AMPERE;
3498
3601
  const int mmq_y = MMQ_Y_Q4_1_AMPERE;
3499
3602
  const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3516,6 +3619,12 @@ template <bool need_check> static __global__ void
3516
3619
  #endif // __CUDA_ARCH__ >= CC_TURING
3517
3620
  }
3518
3621
 
3622
+ #define MMQ_X_Q5_0_RDNA2 64
3623
+ #define MMQ_Y_Q5_0_RDNA2 128
3624
+ #define NWARPS_Q5_0_RDNA2 8
3625
+ #define MMQ_X_Q5_0_RDNA1 64
3626
+ #define MMQ_Y_Q5_0_RDNA1 64
3627
+ #define NWARPS_Q5_0_RDNA1 8
3519
3628
  #define MMQ_X_Q5_0_AMPERE 128
3520
3629
  #define MMQ_Y_Q5_0_AMPERE 64
3521
3630
  #define NWARPS_Q5_0_AMPERE 4
@@ -3523,11 +3632,32 @@ template <bool need_check> static __global__ void
3523
3632
  #define MMQ_Y_Q5_0_PASCAL 64
3524
3633
  #define NWARPS_Q5_0_PASCAL 8
3525
3634
 
3526
- template <bool need_check> static __global__ void mul_mat_q5_0(
3635
+ template <bool need_check> static __global__ void
3636
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3637
+ #if defined(RDNA3) || defined(RDNA2)
3638
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
3639
+ #endif // defined(RDNA3) || defined(RDNA2)
3640
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3641
+ mul_mat_q5_0(
3527
3642
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3528
3643
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3529
3644
 
3530
- #if __CUDA_ARCH__ >= CC_TURING
3645
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3646
+ #if defined(RDNA3) || defined(RDNA2)
3647
+ const int mmq_x = MMQ_X_Q5_0_RDNA2;
3648
+ const int mmq_y = MMQ_Y_Q5_0_RDNA2;
3649
+ const int nwarps = NWARPS_Q5_0_RDNA2;
3650
+ #else
3651
+ const int mmq_x = MMQ_X_Q5_0_RDNA1;
3652
+ const int mmq_y = MMQ_Y_Q5_0_RDNA1;
3653
+ const int nwarps = NWARPS_Q5_0_RDNA1;
3654
+ #endif // defined(RDNA3) || defined(RDNA2)
3655
+
3656
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
3657
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3658
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3659
+
3660
+ #elif __CUDA_ARCH__ >= CC_TURING
3531
3661
  const int mmq_x = MMQ_X_Q5_0_AMPERE;
3532
3662
  const int mmq_y = MMQ_Y_Q5_0_AMPERE;
3533
3663
  const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3550,6 +3680,12 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
3550
3680
  #endif // __CUDA_ARCH__ >= CC_TURING
3551
3681
  }
3552
3682
 
3683
+ #define MMQ_X_Q5_1_RDNA2 64
3684
+ #define MMQ_Y_Q5_1_RDNA2 128
3685
+ #define NWARPS_Q5_1_RDNA2 8
3686
+ #define MMQ_X_Q5_1_RDNA1 64
3687
+ #define MMQ_Y_Q5_1_RDNA1 64
3688
+ #define NWARPS_Q5_1_RDNA1 8
3553
3689
  #define MMQ_X_Q5_1_AMPERE 128
3554
3690
  #define MMQ_Y_Q5_1_AMPERE 64
3555
3691
  #define NWARPS_Q5_1_AMPERE 4
@@ -3557,11 +3693,32 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
3557
3693
  #define MMQ_Y_Q5_1_PASCAL 64
3558
3694
  #define NWARPS_Q5_1_PASCAL 8
3559
3695
 
3560
- template <bool need_check> static __global__ void mul_mat_q5_1(
3696
+ template <bool need_check> static __global__ void
3697
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3698
+ #if defined(RDNA3) || defined(RDNA2)
3699
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
3700
+ #endif // defined(RDNA3) || defined(RDNA2)
3701
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3702
+ mul_mat_q5_1(
3561
3703
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3562
3704
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3563
3705
 
3564
- #if __CUDA_ARCH__ >= CC_TURING
3706
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3707
+ #if defined(RDNA3) || defined(RDNA2)
3708
+ const int mmq_x = MMQ_X_Q5_1_RDNA2;
3709
+ const int mmq_y = MMQ_Y_Q5_1_RDNA2;
3710
+ const int nwarps = NWARPS_Q5_1_RDNA2;
3711
+ #else
3712
+ const int mmq_x = MMQ_X_Q5_1_RDNA1;
3713
+ const int mmq_y = MMQ_Y_Q5_1_RDNA1;
3714
+ const int nwarps = NWARPS_Q5_1_RDNA1;
3715
+ #endif // defined(RDNA3) || defined(RDNA2)
3716
+
3717
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
3718
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3719
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3720
+
3721
+ #elif __CUDA_ARCH__ >= CC_TURING
3565
3722
  const int mmq_x = MMQ_X_Q5_1_AMPERE;
3566
3723
  const int mmq_y = MMQ_Y_Q5_1_AMPERE;
3567
3724
  const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3584,6 +3741,12 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
3584
3741
  #endif // __CUDA_ARCH__ >= CC_TURING
3585
3742
  }
3586
3743
 
3744
+ #define MMQ_X_Q8_0_RDNA2 64
3745
+ #define MMQ_Y_Q8_0_RDNA2 128
3746
+ #define NWARPS_Q8_0_RDNA2 8
3747
+ #define MMQ_X_Q8_0_RDNA1 64
3748
+ #define MMQ_Y_Q8_0_RDNA1 64
3749
+ #define NWARPS_Q8_0_RDNA1 8
3587
3750
  #define MMQ_X_Q8_0_AMPERE 128
3588
3751
  #define MMQ_Y_Q8_0_AMPERE 64
3589
3752
  #define NWARPS_Q8_0_AMPERE 4
@@ -3591,11 +3754,32 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
3591
3754
  #define MMQ_Y_Q8_0_PASCAL 64
3592
3755
  #define NWARPS_Q8_0_PASCAL 8
3593
3756
 
3594
- template <bool need_check> static __global__ void mul_mat_q8_0(
3757
+ template <bool need_check> static __global__ void
3758
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3759
+ #if defined(RDNA3) || defined(RDNA2)
3760
+ __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
3761
+ #endif // defined(RDNA3) || defined(RDNA2)
3762
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3763
+ mul_mat_q8_0(
3595
3764
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3596
3765
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3597
3766
 
3598
- #if __CUDA_ARCH__ >= CC_TURING
3767
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3768
+ #if defined(RDNA3) || defined(RDNA2)
3769
+ const int mmq_x = MMQ_X_Q8_0_RDNA2;
3770
+ const int mmq_y = MMQ_Y_Q8_0_RDNA2;
3771
+ const int nwarps = NWARPS_Q8_0_RDNA2;
3772
+ #else
3773
+ const int mmq_x = MMQ_X_Q8_0_RDNA1;
3774
+ const int mmq_y = MMQ_Y_Q8_0_RDNA1;
3775
+ const int nwarps = NWARPS_Q8_0_RDNA1;
3776
+ #endif // defined(RDNA3) || defined(RDNA2)
3777
+
3778
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
3779
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3780
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3781
+
3782
+ #elif __CUDA_ARCH__ >= CC_TURING
3599
3783
  const int mmq_x = MMQ_X_Q8_0_AMPERE;
3600
3784
  const int mmq_y = MMQ_Y_Q8_0_AMPERE;
3601
3785
  const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3618,6 +3802,12 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
3618
3802
  #endif // __CUDA_ARCH__ >= CC_TURING
3619
3803
  }
3620
3804
 
3805
+ #define MMQ_X_Q2_K_RDNA2 64
3806
+ #define MMQ_Y_Q2_K_RDNA2 128
3807
+ #define NWARPS_Q2_K_RDNA2 8
3808
+ #define MMQ_X_Q2_K_RDNA1 128
3809
+ #define MMQ_Y_Q2_K_RDNA1 32
3810
+ #define NWARPS_Q2_K_RDNA1 8
3621
3811
  #define MMQ_X_Q2_K_AMPERE 64
3622
3812
  #define MMQ_Y_Q2_K_AMPERE 128
3623
3813
  #define NWARPS_Q2_K_AMPERE 4
@@ -3625,11 +3815,32 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
3625
3815
  #define MMQ_Y_Q2_K_PASCAL 64
3626
3816
  #define NWARPS_Q2_K_PASCAL 8
3627
3817
 
3628
- template <bool need_check> static __global__ void mul_mat_q2_K(
3818
+ template <bool need_check> static __global__ void
3819
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3820
+ #if defined(RDNA3) || defined(RDNA2)
3821
+ __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
3822
+ #endif // defined(RDNA3) || defined(RDNA2)
3823
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3824
+ mul_mat_q2_K(
3629
3825
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3630
3826
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3631
3827
 
3632
- #if __CUDA_ARCH__ >= CC_TURING
3828
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3829
+ #if defined(RDNA3) || defined(RDNA2)
3830
+ const int mmq_x = MMQ_X_Q2_K_RDNA2;
3831
+ const int mmq_y = MMQ_Y_Q2_K_RDNA2;
3832
+ const int nwarps = NWARPS_Q2_K_RDNA2;
3833
+ #else
3834
+ const int mmq_x = MMQ_X_Q2_K_RDNA1;
3835
+ const int mmq_y = MMQ_Y_Q2_K_RDNA1;
3836
+ const int nwarps = NWARPS_Q2_K_RDNA1;
3837
+ #endif // defined(RDNA3) || defined(RDNA2)
3838
+
3839
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
3840
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3841
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3842
+
3843
+ #elif __CUDA_ARCH__ >= CC_TURING
3633
3844
  const int mmq_x = MMQ_X_Q2_K_AMPERE;
3634
3845
  const int mmq_y = MMQ_Y_Q2_K_AMPERE;
3635
3846
  const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3652,6 +3863,12 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
3652
3863
  #endif // __CUDA_ARCH__ >= CC_TURING
3653
3864
  }
3654
3865
 
3866
+ #define MMQ_X_Q3_K_RDNA2 128
3867
+ #define MMQ_Y_Q3_K_RDNA2 64
3868
+ #define NWARPS_Q3_K_RDNA2 8
3869
+ #define MMQ_X_Q3_K_RDNA1 32
3870
+ #define MMQ_Y_Q3_K_RDNA1 128
3871
+ #define NWARPS_Q3_K_RDNA1 8
3655
3872
  #define MMQ_X_Q3_K_AMPERE 128
3656
3873
  #define MMQ_Y_Q3_K_AMPERE 128
3657
3874
  #define NWARPS_Q3_K_AMPERE 4
@@ -3660,14 +3877,33 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
3660
3877
  #define NWARPS_Q3_K_PASCAL 8
3661
3878
 
3662
3879
  template <bool need_check> static __global__ void
3663
- #if __CUDA_ARCH__ < CC_TURING
3880
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3881
+ #if defined(RDNA3) || defined(RDNA2)
3882
+ __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
3883
+ #endif // defined(RDNA3) || defined(RDNA2)
3884
+ #elif __CUDA_ARCH__ < CC_TURING
3664
3885
  __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
3665
3886
  #endif // __CUDA_ARCH__ < CC_TURING
3666
3887
  mul_mat_q3_K(
3667
3888
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3668
3889
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3669
3890
 
3670
- #if __CUDA_ARCH__ >= CC_TURING
3891
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3892
+ #if defined(RDNA3) || defined(RDNA2)
3893
+ const int mmq_x = MMQ_X_Q3_K_RDNA2;
3894
+ const int mmq_y = MMQ_Y_Q3_K_RDNA2;
3895
+ const int nwarps = NWARPS_Q3_K_RDNA2;
3896
+ #else
3897
+ const int mmq_x = MMQ_X_Q3_K_RDNA1;
3898
+ const int mmq_y = MMQ_Y_Q3_K_RDNA1;
3899
+ const int nwarps = NWARPS_Q3_K_RDNA1;
3900
+ #endif // defined(RDNA3) || defined(RDNA2)
3901
+
3902
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
3903
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3904
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3905
+
3906
+ #elif __CUDA_ARCH__ >= CC_TURING
3671
3907
  const int mmq_x = MMQ_X_Q3_K_AMPERE;
3672
3908
  const int mmq_y = MMQ_Y_Q3_K_AMPERE;
3673
3909
  const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3690,6 +3926,12 @@ template <bool need_check> static __global__ void
3690
3926
  #endif // __CUDA_ARCH__ >= CC_TURING
3691
3927
  }
3692
3928
 
3929
+ #define MMQ_X_Q4_K_RDNA2 64
3930
+ #define MMQ_Y_Q4_K_RDNA2 128
3931
+ #define NWARPS_Q4_K_RDNA2 8
3932
+ #define MMQ_X_Q4_K_RDNA1 32
3933
+ #define MMQ_Y_Q4_K_RDNA1 64
3934
+ #define NWARPS_Q4_K_RDNA1 8
3693
3935
  #define MMQ_X_Q4_K_AMPERE 64
3694
3936
  #define MMQ_Y_Q4_K_AMPERE 128
3695
3937
  #define NWARPS_Q4_K_AMPERE 4
@@ -3698,14 +3940,33 @@ template <bool need_check> static __global__ void
3698
3940
  #define NWARPS_Q4_K_PASCAL 8
3699
3941
 
3700
3942
  template <bool need_check> static __global__ void
3701
- #if __CUDA_ARCH__ < CC_TURING
3943
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3944
+ #if defined(RDNA3) || defined(RDNA2)
3945
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
3946
+ #endif // defined(RDNA3) || defined(RDNA2)
3947
+ #elif __CUDA_ARCH__ < CC_TURING
3702
3948
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
3703
3949
  #endif // __CUDA_ARCH__ < CC_TURING
3704
3950
  mul_mat_q4_K(
3705
3951
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3706
3952
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3707
3953
 
3708
- #if __CUDA_ARCH__ >= CC_TURING
3954
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3955
+ #if defined(RDNA3) || defined(RDNA2)
3956
+ const int mmq_x = MMQ_X_Q4_K_RDNA2;
3957
+ const int mmq_y = MMQ_Y_Q4_K_RDNA2;
3958
+ const int nwarps = NWARPS_Q4_K_RDNA2;
3959
+ #else
3960
+ const int mmq_x = MMQ_X_Q4_K_RDNA1;
3961
+ const int mmq_y = MMQ_Y_Q4_K_RDNA1;
3962
+ const int nwarps = NWARPS_Q4_K_RDNA1;
3963
+ #endif // defined(RDNA3) || defined(RDNA2)
3964
+
3965
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
3966
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3967
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3968
+
3969
+ #elif __CUDA_ARCH__ >= CC_TURING
3709
3970
  const int mmq_x = MMQ_X_Q4_K_AMPERE;
3710
3971
  const int mmq_y = MMQ_Y_Q4_K_AMPERE;
3711
3972
  const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -3728,6 +3989,12 @@ template <bool need_check> static __global__ void
3728
3989
  #endif // __CUDA_ARCH__ >= CC_TURING
3729
3990
  }
3730
3991
 
3992
+ #define MMQ_X_Q5_K_RDNA2 64
3993
+ #define MMQ_Y_Q5_K_RDNA2 128
3994
+ #define NWARPS_Q5_K_RDNA2 8
3995
+ #define MMQ_X_Q5_K_RDNA1 32
3996
+ #define MMQ_Y_Q5_K_RDNA1 64
3997
+ #define NWARPS_Q5_K_RDNA1 8
3731
3998
  #define MMQ_X_Q5_K_AMPERE 64
3732
3999
  #define MMQ_Y_Q5_K_AMPERE 128
3733
4000
  #define NWARPS_Q5_K_AMPERE 4
@@ -3735,11 +4002,32 @@ template <bool need_check> static __global__ void
3735
4002
  #define MMQ_Y_Q5_K_PASCAL 64
3736
4003
  #define NWARPS_Q5_K_PASCAL 8
3737
4004
 
3738
- template <bool need_check> static __global__ void mul_mat_q5_K(
4005
+ template <bool need_check> static __global__ void
4006
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4007
+ #if defined(RDNA3) || defined(RDNA2)
4008
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
4009
+ #endif // defined(RDNA3) || defined(RDNA2)
4010
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4011
+ mul_mat_q5_K(
3739
4012
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3740
4013
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3741
4014
 
3742
- #if __CUDA_ARCH__ >= CC_TURING
4015
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4016
+ #if defined(RDNA3) || defined(RDNA2)
4017
+ const int mmq_x = MMQ_X_Q5_K_RDNA2;
4018
+ const int mmq_y = MMQ_Y_Q5_K_RDNA2;
4019
+ const int nwarps = NWARPS_Q5_K_RDNA2;
4020
+ #else
4021
+ const int mmq_x = MMQ_X_Q5_K_RDNA1;
4022
+ const int mmq_y = MMQ_Y_Q5_K_RDNA1;
4023
+ const int nwarps = NWARPS_Q5_K_RDNA1;
4024
+ #endif // defined(RDNA3) || defined(RDNA2)
4025
+
4026
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4027
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4028
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4029
+
4030
+ #elif __CUDA_ARCH__ >= CC_TURING
3743
4031
  const int mmq_x = MMQ_X_Q5_K_AMPERE;
3744
4032
  const int mmq_y = MMQ_Y_Q5_K_AMPERE;
3745
4033
  const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -3762,6 +4050,12 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
3762
4050
  #endif // __CUDA_ARCH__ >= CC_TURING
3763
4051
  }
3764
4052
 
4053
+ #define MMQ_X_Q6_K_RDNA2 64
4054
+ #define MMQ_Y_Q6_K_RDNA2 128
4055
+ #define NWARPS_Q6_K_RDNA2 8
4056
+ #define MMQ_X_Q6_K_RDNA1 32
4057
+ #define MMQ_Y_Q6_K_RDNA1 64
4058
+ #define NWARPS_Q6_K_RDNA1 8
3765
4059
  #define MMQ_X_Q6_K_AMPERE 64
3766
4060
  #define MMQ_Y_Q6_K_AMPERE 64
3767
4061
  #define NWARPS_Q6_K_AMPERE 4
@@ -3770,14 +4064,33 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
3770
4064
  #define NWARPS_Q6_K_PASCAL 8
3771
4065
 
3772
4066
  template <bool need_check> static __global__ void
3773
- #if __CUDA_ARCH__ < CC_TURING
4067
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4068
+ #if defined(RDNA3) || defined(RDNA2)
4069
+ __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
4070
+ #endif // defined(RDNA3) || defined(RDNA2)
4071
+ #elif __CUDA_ARCH__ < CC_TURING
3774
4072
  __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
3775
4073
  #endif // __CUDA_ARCH__ < CC_TURING
3776
4074
  mul_mat_q6_K(
3777
4075
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3778
4076
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3779
4077
 
3780
- #if __CUDA_ARCH__ >= CC_TURING
4078
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4079
+ #if defined(RDNA3) || defined(RDNA2)
4080
+ const int mmq_x = MMQ_X_Q6_K_RDNA2;
4081
+ const int mmq_y = MMQ_Y_Q6_K_RDNA2;
4082
+ const int nwarps = NWARPS_Q6_K_RDNA2;
4083
+ #else
4084
+ const int mmq_x = MMQ_X_Q6_K_RDNA1;
4085
+ const int mmq_y = MMQ_Y_Q6_K_RDNA1;
4086
+ const int nwarps = NWARPS_Q6_K_RDNA1;
4087
+ #endif // defined(RDNA3) || defined(RDNA2)
4088
+
4089
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4090
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4091
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4092
+
4093
+ #elif __CUDA_ARCH__ >= CC_TURING
3781
4094
  const int mmq_x = MMQ_X_Q6_K_AMPERE;
3782
4095
  const int mmq_y = MMQ_Y_Q6_K_AMPERE;
3783
4096
  const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4086,7 +4399,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
  dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
  }

- static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
+ const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
  const int half_n_dims = ncols/4;

@@ -4098,8 +4412,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
  const int i = row*ncols + col;

  const float col_theta_scale = powf(theta_scale, col);
+ const float p = p0 + p_delta*(row/p_delta_rows);

- const float theta = p*col_theta_scale;
+ const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
  const float sin_theta = sinf(theta);
  const float cos_theta = cosf(theta);

@@ -4109,7 +4424,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
  dst[i + 0] = x0*cos_theta - x1*sin_theta;
  dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

- const float block_theta = block_p*col_theta_scale;
+ const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
  const float sin_block_theta = sinf(block_theta);
  const float cos_block_theta = cosf(block_theta);

@@ -4558,7 +4873,15 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
4558
4873
  const int compute_capability = g_compute_capabilities[id];
4559
4874
 
4560
4875
  int mmq_x, mmq_y, nwarps;
4561
- if (compute_capability >= CC_TURING) {
4876
+ if (compute_capability >= CC_RDNA2) {
4877
+ mmq_x = MMQ_X_Q4_0_RDNA2;
4878
+ mmq_y = MMQ_Y_Q4_0_RDNA2;
4879
+ nwarps = NWARPS_Q4_0_RDNA2;
4880
+ } else if (compute_capability >= CC_OFFSET_AMD) {
4881
+ mmq_x = MMQ_X_Q4_0_RDNA1;
4882
+ mmq_y = MMQ_Y_Q4_0_RDNA1;
4883
+ nwarps = NWARPS_Q4_0_RDNA1;
4884
+ } else if (compute_capability >= CC_TURING) {
4562
4885
  mmq_x = MMQ_X_Q4_0_AMPERE;
4563
4886
  mmq_y = MMQ_Y_Q4_0_AMPERE;
4564
4887
  nwarps = NWARPS_Q4_0_AMPERE;
@@ -4595,7 +4918,15 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
4595
4918
  const int compute_capability = g_compute_capabilities[id];
4596
4919
 
4597
4920
  int mmq_x, mmq_y, nwarps;
4598
- if (compute_capability >= CC_TURING) {
4921
+ if (compute_capability >= CC_RDNA2) {
4922
+ mmq_x = MMQ_X_Q4_1_RDNA2;
4923
+ mmq_y = MMQ_Y_Q4_1_RDNA2;
4924
+ nwarps = NWARPS_Q4_1_RDNA2;
4925
+ } else if (compute_capability >= CC_OFFSET_AMD) {
4926
+ mmq_x = MMQ_X_Q4_1_RDNA1;
4927
+ mmq_y = MMQ_Y_Q4_1_RDNA1;
4928
+ nwarps = NWARPS_Q4_1_RDNA1;
4929
+ } else if (compute_capability >= CC_TURING) {
4599
4930
  mmq_x = MMQ_X_Q4_1_AMPERE;
4600
4931
  mmq_y = MMQ_Y_Q4_1_AMPERE;
4601
4932
  nwarps = NWARPS_Q4_1_AMPERE;
@@ -4632,7 +4963,15 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
4632
4963
  const int compute_capability = g_compute_capabilities[id];
4633
4964
 
4634
4965
  int mmq_x, mmq_y, nwarps;
4635
- if (compute_capability >= CC_TURING) {
4966
+ if (compute_capability >= CC_RDNA2) {
4967
+ mmq_x = MMQ_X_Q5_0_RDNA2;
4968
+ mmq_y = MMQ_Y_Q5_0_RDNA2;
4969
+ nwarps = NWARPS_Q5_0_RDNA2;
4970
+ } else if (compute_capability >= CC_OFFSET_AMD) {
4971
+ mmq_x = MMQ_X_Q5_0_RDNA1;
4972
+ mmq_y = MMQ_Y_Q5_0_RDNA1;
4973
+ nwarps = NWARPS_Q5_0_RDNA1;
4974
+ } else if (compute_capability >= CC_TURING) {
4636
4975
  mmq_x = MMQ_X_Q5_0_AMPERE;
4637
4976
  mmq_y = MMQ_Y_Q5_0_AMPERE;
4638
4977
  nwarps = NWARPS_Q5_0_AMPERE;
@@ -4669,7 +5008,15 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
4669
5008
  const int compute_capability = g_compute_capabilities[id];
4670
5009
 
4671
5010
  int mmq_x, mmq_y, nwarps;
4672
- if (compute_capability >= CC_TURING) {
5011
+ if (compute_capability >= CC_RDNA2) {
5012
+ mmq_x = MMQ_X_Q5_1_RDNA2;
5013
+ mmq_y = MMQ_Y_Q5_1_RDNA2;
5014
+ nwarps = NWARPS_Q5_1_RDNA2;
5015
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5016
+ mmq_x = MMQ_X_Q5_1_RDNA1;
5017
+ mmq_y = MMQ_Y_Q5_1_RDNA1;
5018
+ nwarps = NWARPS_Q5_1_RDNA1;
5019
+ } else if (compute_capability >= CC_TURING) {
4673
5020
  mmq_x = MMQ_X_Q5_1_AMPERE;
4674
5021
  mmq_y = MMQ_Y_Q5_1_AMPERE;
4675
5022
  nwarps = NWARPS_Q5_1_AMPERE;
@@ -4706,7 +5053,15 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
4706
5053
  const int compute_capability = g_compute_capabilities[id];
4707
5054
 
4708
5055
  int mmq_x, mmq_y, nwarps;
4709
- if (compute_capability >= CC_TURING) {
5056
+ if (compute_capability >= CC_RDNA2) {
5057
+ mmq_x = MMQ_X_Q8_0_RDNA2;
5058
+ mmq_y = MMQ_Y_Q8_0_RDNA2;
5059
+ nwarps = NWARPS_Q8_0_RDNA2;
5060
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5061
+ mmq_x = MMQ_X_Q8_0_RDNA1;
5062
+ mmq_y = MMQ_Y_Q8_0_RDNA1;
5063
+ nwarps = NWARPS_Q8_0_RDNA1;
5064
+ } else if (compute_capability >= CC_TURING) {
4710
5065
  mmq_x = MMQ_X_Q8_0_AMPERE;
4711
5066
  mmq_y = MMQ_Y_Q8_0_AMPERE;
4712
5067
  nwarps = NWARPS_Q8_0_AMPERE;
@@ -4743,7 +5098,15 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
4743
5098
  const int compute_capability = g_compute_capabilities[id];
4744
5099
 
4745
5100
  int mmq_x, mmq_y, nwarps;
4746
- if (compute_capability >= CC_TURING) {
5101
+ if (compute_capability >= CC_RDNA2) {
5102
+ mmq_x = MMQ_X_Q2_K_RDNA2;
5103
+ mmq_y = MMQ_Y_Q2_K_RDNA2;
5104
+ nwarps = NWARPS_Q2_K_RDNA2;
5105
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5106
+ mmq_x = MMQ_X_Q2_K_RDNA1;
5107
+ mmq_y = MMQ_Y_Q2_K_RDNA1;
5108
+ nwarps = NWARPS_Q2_K_RDNA1;
5109
+ } else if (compute_capability >= CC_TURING) {
4747
5110
  mmq_x = MMQ_X_Q2_K_AMPERE;
4748
5111
  mmq_y = MMQ_Y_Q2_K_AMPERE;
4749
5112
  nwarps = NWARPS_Q2_K_AMPERE;
@@ -4782,7 +5145,15 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
4782
5145
  const int compute_capability = g_compute_capabilities[id];
4783
5146
 
4784
5147
  int mmq_x, mmq_y, nwarps;
4785
- if (compute_capability >= CC_TURING) {
5148
+ if (compute_capability >= CC_RDNA2) {
5149
+ mmq_x = MMQ_X_Q3_K_RDNA2;
5150
+ mmq_y = MMQ_Y_Q3_K_RDNA2;
5151
+ nwarps = NWARPS_Q3_K_RDNA2;
5152
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5153
+ mmq_x = MMQ_X_Q3_K_RDNA1;
5154
+ mmq_y = MMQ_Y_Q3_K_RDNA1;
5155
+ nwarps = NWARPS_Q3_K_RDNA1;
5156
+ } else if (compute_capability >= CC_TURING) {
4786
5157
  mmq_x = MMQ_X_Q3_K_AMPERE;
4787
5158
  mmq_y = MMQ_Y_Q3_K_AMPERE;
4788
5159
  nwarps = NWARPS_Q3_K_AMPERE;
@@ -4820,7 +5191,15 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
4820
5191
  const int compute_capability = g_compute_capabilities[id];
4821
5192
 
4822
5193
  int mmq_x, mmq_y, nwarps;
4823
- if (compute_capability >= CC_TURING) {
5194
+ if (compute_capability >= CC_RDNA2) {
5195
+ mmq_x = MMQ_X_Q4_K_RDNA2;
5196
+ mmq_y = MMQ_Y_Q4_K_RDNA2;
5197
+ nwarps = NWARPS_Q4_K_RDNA2;
5198
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5199
+ mmq_x = MMQ_X_Q4_K_RDNA1;
5200
+ mmq_y = MMQ_Y_Q4_K_RDNA1;
5201
+ nwarps = NWARPS_Q4_K_RDNA1;
5202
+ } else if (compute_capability >= CC_TURING) {
4824
5203
  mmq_x = MMQ_X_Q4_K_AMPERE;
4825
5204
  mmq_y = MMQ_Y_Q4_K_AMPERE;
4826
5205
  nwarps = NWARPS_Q4_K_AMPERE;
@@ -4857,7 +5236,15 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
4857
5236
  const int compute_capability = g_compute_capabilities[id];
4858
5237
 
4859
5238
  int mmq_x, mmq_y, nwarps;
4860
- if (compute_capability >= CC_TURING) {
5239
+ if (compute_capability >= CC_RDNA2) {
5240
+ mmq_x = MMQ_X_Q5_K_RDNA2;
5241
+ mmq_y = MMQ_Y_Q5_K_RDNA2;
5242
+ nwarps = NWARPS_Q5_K_RDNA2;
5243
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5244
+ mmq_x = MMQ_X_Q5_K_RDNA1;
5245
+ mmq_y = MMQ_Y_Q5_K_RDNA1;
5246
+ nwarps = NWARPS_Q5_K_RDNA1;
5247
+ } else if (compute_capability >= CC_TURING) {
4861
5248
  mmq_x = MMQ_X_Q5_K_AMPERE;
4862
5249
  mmq_y = MMQ_Y_Q5_K_AMPERE;
4863
5250
  nwarps = NWARPS_Q5_K_AMPERE;
@@ -4894,7 +5281,15 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
4894
5281
  const int compute_capability = g_compute_capabilities[id];
4895
5282
 
4896
5283
  int mmq_x, mmq_y, nwarps;
4897
- if (compute_capability >= CC_TURING) {
5284
+ if (compute_capability >= CC_RDNA2) {
5285
+ mmq_x = MMQ_X_Q6_K_RDNA2;
5286
+ mmq_y = MMQ_Y_Q6_K_RDNA2;
5287
+ nwarps = NWARPS_Q6_K_RDNA2;
5288
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5289
+ mmq_x = MMQ_X_Q6_K_RDNA1;
5290
+ mmq_y = MMQ_Y_Q6_K_RDNA1;
5291
+ nwarps = NWARPS_Q6_K_RDNA1;
5292
+ } else if (compute_capability >= CC_TURING) {
4898
5293
  mmq_x = MMQ_X_Q6_K_AMPERE;
4899
5294
  mmq_y = MMQ_Y_Q6_K_AMPERE;
4900
5295
  nwarps = NWARPS_Q6_K_AMPERE;
@@ -4984,12 +5379,13 @@ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, co
  rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
  }

- static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
- GGML_ASSERT(nrows % 4 == 0);
- const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
- const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
+ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+ const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
+ GGML_ASSERT(ncols % 4 == 0);
+ const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
+ const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
  const dim3 block_nums(num_blocks_x, nrows, 1);
- rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
  }

  static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5127,25 +5523,30 @@ void ggml_init_cublas() {
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
  int64_t total_vram = 0;
  fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
- for (int id = 0; id < g_device_count; ++id) {
+ for (int64_t id = 0; id < g_device_count; ++id) {
  cudaDeviceProp prop;
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
- fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+ fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);

  g_tensor_split[id] = total_vram;
  total_vram += prop.totalGlobalMem;
-
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
+ #else
  g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
  }
- for (int id = 0; id < g_device_count; ++id) {
+ for (int64_t id = 0; id < g_device_count; ++id) {
  g_tensor_split[id] /= total_vram;
  }

- for (int id = 0; id < g_device_count; ++id) {
- CUDA_CHECK(cudaSetDevice(id));
+ for (int64_t id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(ggml_cuda_set_device(id));

- // create main stream
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
+ // create cuda streams
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
+ }

  // create cublas handle
  CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
@@ -5214,7 +5615,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
  if (src->backend == GGML_BACKEND_CPU) {
  kind = cudaMemcpyHostToDevice;
  src_ptr = (char *) src->data;
- } else if (src->backend == GGML_BACKEND_GPU) {
+ } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
+ GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
  kind = cudaMemcpyDeviceToDevice;
  struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
  int id;
@@ -5253,236 +5655,205 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5253
5655
  }
5254
5656
 
5255
5657
  inline void ggml_cuda_op_add(
5256
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5257
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5258
- cudaStream_t & cudaStream_main){
5259
-
5260
- GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
5261
- GGML_ASSERT(src1_ddf_i != nullptr);
5262
- GGML_ASSERT(dst_ddf_i != nullptr);
5658
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5659
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5263
5660
 
5264
- const int64_t ne00 = src0->ne[0];
5265
- const int64_t i01_diff = i01_high - i01_low;
5661
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
5266
5662
 
5267
5663
  const int64_t ne10 = src1->ne[0];
5268
5664
  const int64_t ne11 = src1->ne[1];
5269
5665
 
5270
- // compute
5271
5666
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
5272
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
5667
+ add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
5273
5668
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
5274
- add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
5669
+ add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
5275
5670
  } else {
5276
5671
  GGML_ASSERT(false);
5277
5672
  }
5278
5673
 
5279
5674
  (void) src1;
5280
5675
  (void) dst;
5281
- (void) src0_ddq_i;
5282
- (void) i02;
5283
- (void) i1;
5284
5676
  }
5285
5677
 
5286
5678
  inline void ggml_cuda_op_mul(
5287
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5288
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5289
- cudaStream_t & cudaStream_main){
5679
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5680
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5290
5681
 
5291
- GGML_ASSERT(src0_ddf_i != nullptr);
5292
- GGML_ASSERT(src1_ddf_i != nullptr);
5293
- GGML_ASSERT(dst_ddf_i != nullptr);
5294
-
5295
- const int64_t ne00 = src0->ne[0];
5296
- const int64_t i01_diff = i01_high - i01_low;
5682
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5683
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
5684
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5297
5685
 
5298
5686
  const int64_t ne10 = src1->ne[0];
5299
5687
  const int64_t ne11 = src1->ne[1];
5300
5688
 
5301
- mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
5689
+ mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
5302
5690
 
5303
5691
  (void) dst;
5304
- (void) src0_ddq_i;
5305
- (void) i02;
5306
- (void) i1;
5307
5692
  }
5308
5693
 
5309
5694
  inline void ggml_cuda_op_gelu(
5310
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5311
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5312
- cudaStream_t & cudaStream_main){
5313
-
5314
- GGML_ASSERT(src0_ddf_i != nullptr);
5315
- GGML_ASSERT(dst_ddf_i != nullptr);
5695
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5696
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5316
5697
 
5317
- const int64_t ne00 = src0->ne[0];
5318
- const int64_t i01_diff = i01_high - i01_low;
5698
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5699
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5319
5700
 
5320
- // compute
5321
- gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
5701
+ gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
5322
5702
 
5323
5703
  (void) src1;
5324
5704
  (void) dst;
5325
- (void) src0_ddq_i;
5326
- (void) src1_ddf_i;
5327
- (void) i02;
5328
- (void) i1;
5705
+ (void) src1_dd;
5329
5706
  }
5330
5707
 
5331
5708
  inline void ggml_cuda_op_silu(
5332
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5333
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5334
- cudaStream_t & cudaStream_main){
5335
-
5336
- GGML_ASSERT(src0_ddf_i != nullptr);
5337
- GGML_ASSERT(dst_ddf_i != nullptr);
5709
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5710
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5338
5711
 
5339
- const int64_t ne00 = src0->ne[0];
5340
- const int64_t i01_diff = i01_high - i01_low;
5712
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5713
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5341
5714
 
5342
- // compute
5343
- silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
5715
+ silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
5344
5716
 
5345
5717
  (void) src1;
5346
5718
  (void) dst;
5347
- (void) src0_ddq_i;
5348
- (void) src1_ddf_i;
5349
- (void) i02;
5350
- (void) i1;
5719
+ (void) src1_dd;
5351
5720
  }
5352
5721
 
5353
5722
  inline void ggml_cuda_op_norm(
5354
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5355
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5356
- cudaStream_t & cudaStream_main){
5723
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5724
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5357
5725
 
5358
- GGML_ASSERT(src0_ddf_i != nullptr);
5359
- GGML_ASSERT(dst_ddf_i != nullptr);
5726
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5727
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5360
5728
 
5361
5729
  const int64_t ne00 = src0->ne[0];
5362
- const int64_t i01_diff = i01_high - i01_low;
5730
+ const int64_t nrows = ggml_nrows(src0);
5363
5731
 
5364
- // compute
5365
- norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
5732
+ norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
5366
5733
 
5367
5734
  (void) src1;
5368
5735
  (void) dst;
5369
- (void) src0_ddq_i;
5370
- (void) src1_ddf_i;
5371
- (void) i02;
5372
- (void) i1;
5736
+ (void) src1_dd;
5373
5737
  }
5374
5738
 
5375
5739
  inline void ggml_cuda_op_rms_norm(
5376
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5377
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5378
- cudaStream_t & cudaStream_main){
5740
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5741
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5379
5742
 
5380
- GGML_ASSERT(src0_ddf_i != nullptr);
5381
- GGML_ASSERT(dst_ddf_i != nullptr);
5743
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5744
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5382
5745
 
5383
5746
  const int64_t ne00 = src0->ne[0];
5384
- const int64_t i01_diff = i01_high - i01_low;
5747
+ const int64_t nrows = ggml_nrows(src0);
5385
5748
 
5386
5749
  float eps;
5387
5750
  memcpy(&eps, dst->op_params, sizeof(float));
5388
5751
 
5389
- // compute
5390
- rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
5752
+ rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
5391
5753
 
5392
5754
  (void) src1;
5393
5755
  (void) dst;
5394
- (void) src0_ddq_i;
5395
- (void) src1_ddf_i;
5396
- (void) i02;
5397
- (void) i1;
5756
+ (void) src1_dd;
5398
5757
  }
5399
5758
 
5400
5759
  inline void ggml_cuda_op_mul_mat_q(
5401
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5402
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5403
- cudaStream_t & cudaStream_main){
5404
-
5405
- GGML_ASSERT(src0_ddq_i != nullptr);
5406
- GGML_ASSERT(src1_ddf_i != nullptr);
5407
- GGML_ASSERT(dst_ddf_i != nullptr);
5760
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5761
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
5762
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5408
5763
 
5409
5764
  const int64_t ne00 = src0->ne[0];
5410
5765
 
5411
5766
  const int64_t ne10 = src1->ne[0];
5412
- const int64_t ne11 = src1->ne[1];
5413
5767
  GGML_ASSERT(ne10 % QK8_1 == 0);
5414
5768
 
5415
5769
  const int64_t ne0 = dst->ne[0];
5416
5770
 
5417
- const int64_t i01_diff = i01_high - i01_low;
5771
+ const int64_t row_diff = row_high - row_low;
5418
5772
 
5419
5773
  int id;
5420
5774
  CUDA_CHECK(cudaGetDevice(&id));
5421
5775
 
5422
5776
  // the main device has a larger memory buffer to hold the results from all GPUs
5423
5777
  // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
5424
- const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
5425
-
5426
- const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
5427
- ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
5428
- size_t as;
5429
- void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
5430
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
5778
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
5431
5779
 
5432
5780
  switch (src0->type) {
5433
5781
  case GGML_TYPE_Q4_0:
5434
- ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5782
+ ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5435
5783
  break;
5436
5784
  case GGML_TYPE_Q4_1:
5437
- ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5785
+ ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5438
5786
  break;
5439
5787
  case GGML_TYPE_Q5_0:
5440
- ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5788
+ ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5441
5789
  break;
5442
5790
  case GGML_TYPE_Q5_1:
5443
- ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5791
+ ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5444
5792
  break;
5445
5793
  case GGML_TYPE_Q8_0:
5446
- ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5794
+ ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5447
5795
  break;
5448
5796
  case GGML_TYPE_Q2_K:
5449
- ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5797
+ ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5450
5798
  break;
5451
5799
  case GGML_TYPE_Q3_K:
5452
- ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5800
+ ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5453
5801
  break;
5454
5802
  case GGML_TYPE_Q4_K:
5455
- ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5803
+ ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5456
5804
  break;
5457
5805
  case GGML_TYPE_Q5_K:
5458
- ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5806
+ ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5459
5807
  break;
5460
5808
  case GGML_TYPE_Q6_K:
5461
- ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5809
+ ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5462
5810
  break;
5463
5811
  default:
5464
5812
  GGML_ASSERT(false);
5465
5813
  break;
5466
5814
  }
5467
5815
 
5468
- ggml_cuda_pool_free(src1_q8_1, as);
5469
-
5470
5816
  (void) src1;
5471
5817
  (void) dst;
5472
- (void) src0_ddf_i;
5473
- (void) i02;
5474
- (void) i1;
5818
+ (void) src1_ddf_i;
5475
5819
  }
5476
5820
 
5477
5821
  static int64_t get_row_rounding(ggml_type type) {
5478
- int max_compute_capability = INT_MIN;
5479
- for (int id = 0; id < g_device_count; ++id) {
5480
- if (max_compute_capability < g_compute_capabilities[id]
5481
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
5482
- max_compute_capability = g_compute_capabilities[id];
5822
+ int64_t min_compute_capability = INT_MAX;
5823
+ int64_t max_compute_capability = INT_MIN;
5824
+ for (int64_t id = 0; id < g_device_count; ++id) {
5825
+ if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
5826
+ if (min_compute_capability > g_compute_capabilities[id]) {
5827
+ min_compute_capability = g_compute_capabilities[id];
5828
+ }
5829
+ if (max_compute_capability < g_compute_capabilities[id]) {
5830
+ max_compute_capability = g_compute_capabilities[id];
5831
+ }
5483
5832
  }
5484
5833
  }
5485
5834
 
5835
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
5836
+ switch(type) {
5837
+ case GGML_TYPE_Q4_0:
5838
+ case GGML_TYPE_Q4_1:
5839
+ case GGML_TYPE_Q5_0:
5840
+ case GGML_TYPE_Q5_1:
5841
+ case GGML_TYPE_Q8_0:
5842
+ return max_compute_capability >= CC_RDNA2 ? 128 : 64;
5843
+ case GGML_TYPE_F16:
5844
+ return 1;
5845
+ case GGML_TYPE_Q2_K:
5846
+ return max_compute_capability >= CC_RDNA2 ? 128 : 32;
5847
+ case GGML_TYPE_Q3_K:
5848
+ return min_compute_capability < CC_RDNA2 ? 128 : 64;
5849
+ case GGML_TYPE_Q4_K:
5850
+ case GGML_TYPE_Q5_K:
5851
+ case GGML_TYPE_Q6_K:
5852
+ return max_compute_capability >= CC_RDNA2 ? 128 : 64;
5853
+ default:
5854
+ GGML_ASSERT(false);
5855
+ }
5856
+ #else
5486
5857
  switch(type) {
5487
5858
  case GGML_TYPE_Q4_0:
5488
5859
  case GGML_TYPE_Q4_1:
@@ -5503,170 +5874,147 @@ static int64_t get_row_rounding(ggml_type type) {
5503
5874
  default:
5504
5875
  GGML_ASSERT(false);
5505
5876
  }
5877
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
5506
5878
  }
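
get_row_rounding now tracks both the minimum and the maximum compute capability among the devices that actually receive rows, and on HIPBLAS/AMD builds it selects the rounding from a separate table keyed on CC_RDNA2. Judging from the defines earlier in this diff (CC_OFFSET_AMD, CC_RDNA2 = CC_OFFSET_AMD + 1030), AMD GPUs appear to be assigned a synthetic compute capability of CC_OFFSET_AMD plus their gfx number; the sketch below only illustrates that comparison, and the gfx values used are assumptions for illustration.

    // Illustration of the RDNA2 check in the AMD branch above (gfx values are examples).
    #include <cstdio>

    #define CC_OFFSET_AMD 1000000
    #define CC_RDNA2 (CC_OFFSET_AMD + 1030)

    int main() {
        const int cc_rdna2  = CC_OFFSET_AMD + 1030;   // e.g. gfx1030
        const int cc_vega20 = CC_OFFSET_AMD + 906;    // e.g. gfx906
        // Q4_0..Q8_0 row of the switch above: 128-row rounding on RDNA2+, 64 otherwise
        printf("gfx1030: rounding %d\n", cc_rdna2  >= CC_RDNA2 ? 128 : 64);
        printf("gfx906 : rounding %d\n", cc_vega20 >= CC_RDNA2 ? 128 : 64);
        return 0;
    }
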
5507
5879
 
5508
- inline void ggml_cuda_op_mul_mat_vec(
5509
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5510
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5511
- cudaStream_t & cudaStream_main){
5512
-
5513
- GGML_ASSERT(src0_ddq_i != nullptr);
5514
- GGML_ASSERT(src1_ddf_i != nullptr);
5515
- GGML_ASSERT(dst_ddf_i != nullptr);
5880
+ inline void ggml_cuda_op_mul_mat_vec_q(
5881
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5882
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
5883
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5516
5884
 
5517
5885
  const int64_t ne00 = src0->ne[0];
5518
- const int64_t nrows = i01_high - i01_low;
5886
+ const int64_t row_diff = row_high - row_low;
5519
5887
 
5520
- #ifdef GGML_CUDA_FORCE_DMMV
5521
- const bool use_mul_mat_vec_q = false;
5522
- (void) g_compute_capabilities[0];
5523
- #else
5524
- int id;
5525
- CUDA_CHECK(cudaGetDevice(&id));
5888
+ switch (src0->type) {
5889
+ case GGML_TYPE_Q4_0:
5890
+ mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5891
+ break;
5892
+ case GGML_TYPE_Q4_1:
5893
+ mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5894
+ break;
5895
+ case GGML_TYPE_Q5_0:
5896
+ mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5897
+ break;
5898
+ case GGML_TYPE_Q5_1:
5899
+ mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5900
+ break;
5901
+ case GGML_TYPE_Q8_0:
5902
+ mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5903
+ break;
5904
+ case GGML_TYPE_Q2_K:
5905
+ mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5906
+ break;
5907
+ case GGML_TYPE_Q3_K:
5908
+ mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5909
+ break;
5910
+ case GGML_TYPE_Q4_K:
5911
+ mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5912
+ break;
5913
+ case GGML_TYPE_Q5_K:
5914
+ mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5915
+ break;
5916
+ case GGML_TYPE_Q6_K:
5917
+ mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5918
+ break;
5919
+ default:
5920
+ GGML_ASSERT(false);
5921
+ break;
5922
+ }
5526
5923
 
5527
- bool mul_mat_vec_q_implemented =
5528
- src0->type == GGML_TYPE_Q4_0 ||
5529
- src0->type == GGML_TYPE_Q4_1 ||
5530
- src0->type == GGML_TYPE_Q5_0 ||
5531
- src0->type == GGML_TYPE_Q5_1 ||
5532
- src0->type == GGML_TYPE_Q8_0;
5533
- #if QK_K == 256
5534
- mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
5535
- src0->type == GGML_TYPE_Q2_K ||
5536
- src0->type == GGML_TYPE_Q3_K ||
5537
- src0->type == GGML_TYPE_Q4_K ||
5538
- src0->type == GGML_TYPE_Q5_K ||
5539
- src0->type == GGML_TYPE_Q6_K;
5540
- #endif // QK_K == 256
5541
-
5542
- const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
5543
- #endif
5924
+ (void) src1;
5925
+ (void) dst;
5926
+ (void) src1_ddf_i;
5927
+ (void) src1_ncols;
5928
+ (void) src1_padded_row_size;
5929
+ }
5544
5930
 
5545
- if (use_mul_mat_vec_q) {
5546
- const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
5547
- ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
5548
- size_t as;
5549
- void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
5550
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
5551
-
5552
- switch (src0->type) {
5553
- case GGML_TYPE_Q4_0:
5554
- mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5555
- break;
5556
- case GGML_TYPE_Q4_1:
5557
- mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5558
- break;
5559
- case GGML_TYPE_Q5_0:
5560
- mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5561
- break;
5562
- case GGML_TYPE_Q5_1:
5563
- mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5564
- break;
5565
- case GGML_TYPE_Q8_0:
5566
- mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5567
- break;
5568
- case GGML_TYPE_Q2_K:
5569
- mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5570
- break;
5571
- case GGML_TYPE_Q3_K:
5572
- mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5573
- break;
5574
- case GGML_TYPE_Q4_K:
5575
- mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5576
- break;
5577
- case GGML_TYPE_Q5_K:
5578
- mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5579
- break;
5580
- case GGML_TYPE_Q6_K:
5581
- mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5582
- break;
5583
- default:
5584
- GGML_ASSERT(false);
5585
- break;
5586
- }
5931
+ inline void ggml_cuda_op_dequantize_mul_mat_vec(
5932
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5933
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
5934
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5587
5935
 
5588
- ggml_cuda_pool_free(src1_q8_1, as);
5589
- } else {
5590
- // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
5936
+ const int64_t ne00 = src0->ne[0];
5937
+ const int64_t row_diff = row_high - row_low;
5938
+
5939
+ // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
5591
5940
  #ifdef GGML_CUDA_F16
5592
- size_t ash;
5593
- dfloat * src1_dfloat = nullptr; // dfloat == half
5594
-
5595
- bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
5596
- src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
5597
- src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
5598
-
5599
- if (src1_convert_f16) {
5600
- src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
5601
- ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
5602
- ne00, 1, sizeof(float), 0, 0,
5603
- ne00, 1, sizeof(half), 0, 0, cudaStream_main);
5604
- }
5941
+ size_t ash;
5942
+ dfloat * src1_dfloat = nullptr; // dfloat == half
5943
+
5944
+ bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
5945
+ src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
5946
+ src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
5947
+
5948
+ if (src1_convert_f16) {
5949
+ src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
5950
+ ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
5951
+ ne00, 1, sizeof(float), 0, 0,
5952
+ ne00, 1, sizeof(half), 0, 0, stream);
5953
+ }
5605
5954
  #else
5606
- dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
5955
+ const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
5607
5956
  #endif // GGML_CUDA_F16
5608
5957
 
5609
- switch (src0->type) {
5610
- case GGML_TYPE_Q4_0:
5611
- dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5612
- break;
5613
- case GGML_TYPE_Q4_1:
5614
- dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5615
- break;
5616
- case GGML_TYPE_Q5_0:
5617
- dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5618
- break;
5619
- case GGML_TYPE_Q5_1:
5620
- dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5621
- break;
5622
- case GGML_TYPE_Q8_0:
5623
- dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5624
- break;
5625
- case GGML_TYPE_Q2_K:
5626
- dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5627
- break;
5628
- case GGML_TYPE_Q3_K:
5629
- dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5630
- break;
5631
- case GGML_TYPE_Q4_K:
5632
- dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5633
- break;
5634
- case GGML_TYPE_Q5_K:
5635
- dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5636
- break;
5637
- case GGML_TYPE_Q6_K:
5638
- dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5639
- break;
5640
- case GGML_TYPE_F16:
5641
- convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5642
- break;
5643
- default:
5644
- GGML_ASSERT(false);
5645
- break;
5646
- }
5958
+ switch (src0->type) {
5959
+ case GGML_TYPE_Q4_0:
5960
+ dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5961
+ break;
5962
+ case GGML_TYPE_Q4_1:
5963
+ dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5964
+ break;
5965
+ case GGML_TYPE_Q5_0:
5966
+ dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5967
+ break;
5968
+ case GGML_TYPE_Q5_1:
5969
+ dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5970
+ break;
5971
+ case GGML_TYPE_Q8_0:
5972
+ dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5973
+ break;
5974
+ case GGML_TYPE_Q2_K:
5975
+ dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5976
+ break;
5977
+ case GGML_TYPE_Q3_K:
5978
+ dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5979
+ break;
5980
+ case GGML_TYPE_Q4_K:
5981
+ dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5982
+ break;
5983
+ case GGML_TYPE_Q5_K:
5984
+ dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5985
+ break;
5986
+ case GGML_TYPE_Q6_K:
5987
+ dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5988
+ break;
5989
+ case GGML_TYPE_F16:
5990
+ convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5991
+ break;
5992
+ default:
5993
+ GGML_ASSERT(false);
5994
+ break;
5995
+ }
5647
5996
 
5648
5997
  #ifdef GGML_CUDA_F16
5649
- if (src1_convert_f16) {
5650
- ggml_cuda_pool_free(src1_dfloat, ash);
5651
- }
5652
- #endif // GGML_CUDA_F16
5998
+ if (src1_convert_f16) {
5999
+ ggml_cuda_pool_free(src1_dfloat, ash);
5653
6000
  }
6001
+ #endif // GGML_CUDA_F16
5654
6002
 
5655
6003
  (void) src1;
5656
6004
  (void) dst;
5657
- (void) src0_ddf_i;
5658
- (void) i02;
5659
- (void) i1;
6005
+ (void) src1_ddq_i;
6006
+ (void) src1_ncols;
6007
+ (void) src1_padded_row_size;
5660
6008
  }
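
The old ggml_cuda_op_mul_mat_vec, which decided internally between the quantized-dot path and the dequantize path, is split into the two dedicated ops above. Both of them, like ggml_cuda_op_mul_mat_q, now receive src1 twice, as float data (src1_ddf_i) and already quantized to q8_1 (src1_ddq_i), together with the per-device row bounds, the number of src1 columns in the current chunk, and the padded row size, so quantizing src1 happens once in the caller rather than inside every op. Below is a stand-alone sketch of that shared signature; the driver later in this diff passes these functions around as ggml_cuda_op_mul_mat_t.

    // Sketch of the shared mul_mat op signature; stand-in declarations as in the earlier sketch.
    #include <cstdint>

    struct ggml_tensor;                            // opaque stand-in for the real struct
    typedef struct CUstream_st * cudaStream_t;     // mirrors the CUDA runtime handle type

    typedef void (*ggml_cuda_op_mul_mat_t)(
        const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
        const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i,
        float * dst_dd_i, const int64_t row_low, const int64_t row_high,
        const int64_t src1_ncols, const int64_t src1_padded_row_size,
        const cudaStream_t & stream);
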
5661
6009
 
5662
6010
  inline void ggml_cuda_op_mul_mat_cublas(
5663
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5664
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5665
- cudaStream_t & cudaStream_main){
6011
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
6012
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
6013
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5666
6014
 
5667
- GGML_ASSERT(src0_ddf_i != nullptr);
6015
+ GGML_ASSERT(src0_dd_i != nullptr);
5668
6016
  GGML_ASSERT(src1_ddf_i != nullptr);
5669
- GGML_ASSERT(dst_ddf_i != nullptr);
6017
+ GGML_ASSERT(dst_dd_i != nullptr);
5670
6018
 
5671
6019
  const float alpha = 1.0f;
5672
6020
  const float beta = 0.0f;
@@ -5674,43 +6022,54 @@ inline void ggml_cuda_op_mul_mat_cublas(
5674
6022
  const int64_t ne00 = src0->ne[0];
5675
6023
 
5676
6024
  const int64_t ne10 = src1->ne[0];
5677
- const int64_t ne11 = src1->ne[1];
5678
6025
 
5679
6026
  const int64_t ne0 = dst->ne[0];
5680
- const int64_t i01_diff = i01_high - i01_low;
6027
+ const int64_t row_diff = row_high - row_low;
6028
+
6029
+ float * src0_ddq_as_f32;
6030
+ size_t src0_as = 0;
6031
+
6032
+ if (src0->type != GGML_TYPE_F32) {
6033
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
6034
+ src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
6035
+ to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
6036
+ }
6037
+ const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
5681
6038
 
5682
6039
  int id;
5683
6040
  CUDA_CHECK(cudaGetDevice(&id));
5684
6041
 
5685
6042
  // the main device has a larger memory buffer to hold the results from all GPUs
5686
6043
  // ldc == nrows of the matrix that cuBLAS writes into
5687
- int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
6044
+ int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
5688
6045
 
5689
- CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], cudaStream_main));
6046
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
5690
6047
  CUBLAS_CHECK(
5691
6048
  cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
5692
- i01_diff, ne11, ne10,
6049
+ row_diff, src1_ncols, ne10,
5693
6050
  &alpha, src0_ddf_i, ne00,
5694
- src1_ddf_i, ne10,
5695
- &beta, dst_ddf_i, ldc));
6051
+ src1_ddf_i, ne10,
6052
+ &beta, dst_dd_i, ldc));
6053
+
6054
+ if (src0_as > 0) {
6055
+ ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
6056
+ }
5696
6057
 
5697
6058
  (void) dst;
5698
- (void) src0_ddq_i;
5699
- (void) i02;
5700
- (void) i1;
6059
+ (void) src1_ddq_i;
6060
+ (void) src1_padded_row_size;
5701
6061
  }
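
ggml_cuda_op_mul_mat_cublas now accepts quantized or F16 src0 directly and converts the [row_low, row_high) slice to F32 on the device before the GEMM, so the separate src0_needs_f32 conversion in the old driver goes away. For orientation, the cublasSgemm call with CUBLAS_OP_T / CUBLAS_OP_N and those leading dimensions computes the product restated below in plain C++ (sgemm_ref is a hypothetical helper written only for this sketch; dst is column-major with leading dimension ldc, which is ne0 on the main device and row_diff otherwise).

    // Plain C++ restatement of the math behind the cublasSgemm call above.
    #include <cstdio>
    #include <vector>

    static void sgemm_ref(int row_diff, int src1_ncols, int ne10,
                          const float * src0, const float * src1, float * dst, int ldc) {
        for (int j = 0; j < src1_ncols; ++j) {        // one dst column per src1 column
            for (int i = 0; i < row_diff; ++i) {      // one dst row per src0 row in the slice
                float acc = 0.0f;
                for (int k = 0; k < ne10; ++k) {      // lda = ne00 == ne10 for a valid mat mul
                    acc += src0[i*ne10 + k]*src1[j*ne10 + k];
                }
                dst[j*ldc + i] = acc;
            }
        }
    }

    int main() {
        const int row_diff = 2, ncols = 2, ne10 = 3;
        std::vector<float> a = {1, 2, 3, 4, 5, 6};    // two src0 rows
        std::vector<float> b = {1, 0, 0, 0, 1, 0};    // two src1 columns, stored row-wise
        std::vector<float> c(4, 0.0f);
        sgemm_ref(row_diff, ncols, ne10, a.data(), b.data(), c.data(), row_diff);
        printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]);   // prints: 1 4 2 5
        return 0;
    }
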
5702
6062
 
5703
6063
  inline void ggml_cuda_op_rope(
5704
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5705
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5706
- cudaStream_t & cudaStream_main){
6064
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6065
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5707
6066
 
5708
- GGML_ASSERT(src0_ddf_i != nullptr);
5709
- GGML_ASSERT(dst_ddf_i != nullptr);
6067
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6068
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5710
6069
 
5711
6070
  const int64_t ne00 = src0->ne[0];
5712
6071
  const int64_t ne01 = src0->ne[1];
5713
- const int64_t i01_diff = i01_high - i01_low;
6072
+ const int64_t nrows = ggml_nrows(src0);
5714
6073
 
5715
6074
  const int n_past = ((int32_t *) dst->op_params)[0];
5716
6075
  const int n_dims = ((int32_t *) dst->op_params)[1];
@@ -5723,44 +6082,37 @@ inline void ggml_cuda_op_rope(
5723
6082
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
5724
6083
 
5725
6084
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
6085
+ const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5726
6086
 
5727
6087
  const bool is_neox = mode & 2;
5728
6088
  const bool is_glm = mode & 4;
5729
6089
 
5730
6090
  // compute
5731
6091
  if (is_glm) {
5732
- const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
5733
- const float id_p = min(p, n_ctx - 2.f);
5734
- const float block_p = max(p - (n_ctx - 2.f), 0.f);
5735
- rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
6092
+ rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream);
5736
6093
  } else if (is_neox) {
5737
6094
  GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
5738
- const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5739
- rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
6095
+ rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
5740
6096
  } else {
5741
- const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5742
- rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
6097
+ rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
5743
6098
  }
5744
6099
 
5745
6100
  (void) src1;
5746
6101
  (void) dst;
5747
- (void) src0_ddq_i;
5748
- (void) src1_ddf_i;
5749
- (void) i1;
6102
+ (void) src1_dd;
5750
6103
  }
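
ggml_cuda_op_rope now computes p0 once for all three branches and hands the kernels ggml_nrows(src0) instead of a per-slice row count. As a reminder of the underlying math, the host-side sketch below shows standard RoPE for the default (non-NeoX) branch; it is not the rope_f32_cuda kernel itself, and how the kernel combines p0 with the row index is not visible in this diff.

    // Rotate consecutive (x[i], x[i+1]) pairs of one row by theta, where theta starts
    // at the row's scaled position p and is multiplied by theta_scale per pair,
    // with theta_scale = powf(freq_base, -2.0f/n_dims) as in the code above.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static void rope_row(float * x, int ne0, float p, float theta_scale) {
        float theta = p;
        for (int i = 0; i < ne0; i += 2) {
            const float c = cosf(theta), s = sinf(theta);
            const float x0 = x[i], x1 = x[i + 1];
            x[i]     = x0*c - x1*s;
            x[i + 1] = x0*s + x1*c;
            theta *= theta_scale;
        }
    }

    int main() {
        std::vector<float> row = {1, 0, 1, 0};
        rope_row(row.data(), 4, /*p=*/2.0f, /*theta_scale=*/0.5f);
        printf("%g %g %g %g\n", row[0], row[1], row[2], row[3]);
        return 0;
    }
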
5751
6104
 
5752
6105
  inline void ggml_cuda_op_alibi(
5753
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5754
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5755
- cudaStream_t & cudaStream_main){
6106
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6107
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5756
6108
 
5757
- GGML_ASSERT(src0_ddf_i != nullptr);
5758
- GGML_ASSERT(dst_ddf_i != nullptr);
6109
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6110
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5759
6111
 
5760
6112
  const int64_t ne00 = src0->ne[0];
5761
6113
  const int64_t ne01 = src0->ne[1];
5762
6114
  const int64_t ne02 = src0->ne[2];
5763
- const int64_t i01_diff = i01_high - i01_low;
6115
+ const int64_t nrows = ggml_nrows(src0);
5764
6116
 
5765
6117
  const int n_past = ((int32_t *) dst->op_params)[0];
5766
6118
  const int n_head = ((int32_t *) dst->op_params)[1];
@@ -5775,334 +6127,393 @@ inline void ggml_cuda_op_alibi(
5775
6127
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
5776
6128
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
5777
6129
 
5778
- // compute
5779
- alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
6130
+ alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
5780
6131
 
5781
6132
  (void) src1;
5782
- (void) src0_ddq_i;
5783
- (void) src1_ddf_i;
5784
- (void) i1;
6133
+ (void) src1_dd;
5785
6134
  }
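
For ALiBi the op keeps computing the two base slopes m0 and m1 on the host and leaves the per-head slope to the kernel. The per-head value follows the standard ALiBi construction sketched below; the kernel body is not part of this diff, so treat the formula as the usual reference rather than a guarantee about alibi_f32_cuda.

    // Standard ALiBi slope per head, derived from the m0/m1 values computed above.
    #include <cmath>
    #include <cstdio>

    static float alibi_slope(int head, int n_heads_log2_floor, float m0, float m1) {
        if (head < n_heads_log2_floor) {
            return powf(m0, head + 1);
        }
        return powf(m1, 2*(head - n_heads_log2_floor) + 1);
    }

    int main() {
        const int   n_head   = 12;
        const float max_bias = 8.0f;
        const int   n_heads_log2_floor = 1 << (int) floorf(log2f((float) n_head));
        const float m0 = powf(2.0f, -(max_bias)        / n_heads_log2_floor);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
        for (int h = 0; h < n_head; ++h) {
            printf("head %2d slope %g\n", h, alibi_slope(h, n_heads_log2_floor, m0, m1));
        }
        return 0;
    }
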
5786
6135
 
5787
6136
  inline void ggml_cuda_op_diag_mask_inf(
5788
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5789
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5790
- cudaStream_t & cudaStream_main){
6137
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6138
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5791
6139
 
5792
- GGML_ASSERT(src0_ddf_i != nullptr);
5793
- GGML_ASSERT(dst_ddf_i != nullptr);
6140
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6141
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5794
6142
 
5795
6143
  const int64_t ne00 = src0->ne[0];
5796
6144
  const int64_t ne01 = src0->ne[1];
5797
- const int64_t i01_diff = i01_high - i01_low;
6145
+ const int nrows0 = ggml_nrows(src0);
5798
6146
 
5799
6147
  const int n_past = ((int32_t *) dst->op_params)[0];
5800
6148
 
5801
- // compute
5802
- diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
6149
+ diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
5803
6150
 
5804
6151
  (void) src1;
5805
6152
  (void) dst;
5806
- (void) src0_ddq_i;
5807
- (void) src1_ddf_i;
5808
- (void) i02;
5809
- (void) i1;
6153
+ (void) src1_dd;
5810
6154
  }
5811
6155
 
5812
6156
  inline void ggml_cuda_op_soft_max(
5813
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5814
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5815
- cudaStream_t & cudaStream_main){
6157
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6158
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5816
6159
 
5817
- GGML_ASSERT(src0_ddf_i != nullptr);
5818
- GGML_ASSERT(dst_ddf_i != nullptr);
6160
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6161
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5819
6162
 
5820
6163
  const int64_t ne00 = src0->ne[0];
5821
- const int64_t i01_diff = i01_high - i01_low;
6164
+ const int64_t nrows = ggml_nrows(src0);
5822
6165
 
5823
- // compute
5824
- soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
6166
+ soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
5825
6167
 
5826
6168
  (void) src1;
5827
6169
  (void) dst;
5828
- (void) src0_ddq_i;
5829
- (void) src1_ddf_i;
5830
- (void) i02;
5831
- (void) i1;
6170
+ (void) src1_dd;
5832
6171
  }
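
Like the other flattened ops, soft max now runs over ne00 columns and ggml_nrows(src0) rows in one launch. The math it implements is the usual row-wise soft max; the reference below is written with the max-subtraction trick for numerical stability, which may or may not match the exact formulation inside soft_max_f32_cuda.

    // Row-wise soft max reference (not the CUDA kernel): y[r,c] = exp(x[r,c]) / sum_c exp(x[r,c]).
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static void soft_max_rows(const float * x, float * y, int ncols, int nrows) {
        for (int r = 0; r < nrows; ++r) {
            const float * xr = x + (long long) r*ncols;
            float       * yr = y + (long long) r*ncols;
            float max_val = xr[0];
            for (int c = 1; c < ncols; ++c) max_val = fmaxf(max_val, xr[c]);
            float sum = 0.0f;
            for (int c = 0; c < ncols; ++c) { yr[c] = expf(xr[c] - max_val); sum += yr[c]; }
            for (int c = 0; c < ncols; ++c) yr[c] /= sum;
        }
    }

    int main() {
        std::vector<float> x = {1, 2, 3, 0, 0, 0}, y(6);
        soft_max_rows(x.data(), y.data(), 3, 2);
        printf("%.3f %.3f %.3f | %.3f %.3f %.3f\n", y[0], y[1], y[2], y[3], y[4], y[5]);
        return 0;
    }
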
5833
6172
 
5834
6173
  inline void ggml_cuda_op_scale(
5835
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5836
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5837
- cudaStream_t & cudaStream_main){
6174
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6175
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5838
6176
 
5839
- GGML_ASSERT(src0_ddf_i != nullptr);
5840
- GGML_ASSERT(dst_ddf_i != nullptr);
6177
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6178
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
6179
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5841
6180
 
5842
6181
  const float scale = ((float *) src1->data)[0];
5843
6182
 
5844
- const int64_t ne00 = src0->ne[0];
5845
- const int64_t i01_diff = i01_high - i01_low;
5846
-
5847
- // compute
5848
- scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
6183
+ scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
5849
6184
  CUDA_CHECK(cudaGetLastError());
5850
6185
 
5851
6186
  (void) src1;
5852
6187
  (void) dst;
5853
- (void) src0_ddq_i;
5854
- (void) src1_ddf_i;
5855
- (void) i02;
5856
- (void) i1;
6188
+ (void) src1_dd;
6189
+ }
6190
+
6191
+ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
6192
+ const int64_t nrows0 = ggml_nrows(src0);
6193
+
6194
+ const bool use_src1 = src1 != nullptr;
6195
+ const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
6196
+
6197
+ GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6198
+ GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
6199
+
6200
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6201
+ struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
6202
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6203
+
6204
+ const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6205
+ const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
6206
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
6207
+
6208
+ const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
6209
+
6210
+ // dd = data device
6211
+ float * src0_ddf = nullptr;
6212
+ float * src1_ddf = nullptr;
6213
+ float * dst_ddf = nullptr;
6214
+
6215
+ // as = actual size
6216
+ size_t src0_asf = 0;
6217
+ size_t src1_asf = 0;
6218
+ size_t dst_asf = 0;
6219
+
6220
+ ggml_cuda_set_device(g_main_device);
6221
+ const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6222
+
6223
+ if (src0_on_device) {
6224
+ src0_ddf = (float *) src0_extra->data_device[g_main_device];
6225
+ } else {
6226
+ src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
6227
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
6228
+ }
6229
+
6230
+ if (use_src1 && !src1_stays_on_host) {
6231
+ if (src1_on_device) {
6232
+ src1_ddf = (float *) src1_extra->data_device[g_main_device];
6233
+ } else {
6234
+ src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
6235
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
6236
+ }
6237
+ }
6238
+ if (dst_on_device) {
6239
+ dst_ddf = (float *) dst_extra->data_device[g_main_device];
6240
+ } else {
6241
+ dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
6242
+ }
6243
+
6244
+ // do the computation
6245
+ op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
6246
+ CUDA_CHECK(cudaGetLastError());
6247
+
6248
+ // copy dst to host if necessary
6249
+ if (!dst_on_device) {
6250
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
6251
+ }
6252
+
6253
+ if (src0_asf > 0) {
6254
+ ggml_cuda_pool_free(src0_ddf, src0_asf);
6255
+ }
6256
+ if (src1_asf > 0) {
6257
+ ggml_cuda_pool_free(src1_ddf, src1_asf);
6258
+ }
6259
+ if (dst_asf > 0) {
6260
+ ggml_cuda_pool_free(dst_ddf, dst_asf);
6261
+ }
6262
+
6263
+ if (dst->backend == GGML_BACKEND_CPU) {
6264
+ CUDA_CHECK(cudaDeviceSynchronize());
6265
+ }
6266
+ }
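
ggml_cuda_op_flatten is the new single-device driver for the element-wise ops above: it picks the main device, allocates pool buffers for any tensor that is not already on the GPU, copies inputs over, runs the op on one stream, and copies dst back if needed. The stand-alone sketch below mimics that dispatch pattern with host memory only, to show the shape of the design; the names and types are stand-ins, not the ggml ones.

    // Host-only analogue of the "flatten" dispatch: a driver owns buffer handling
    // and calls a per-op function pointer. The real driver additionally manages
    // the CUDA memory pool, the stream and the host<->device copies.
    #include <cstdio>
    #include <vector>

    struct tensor { std::vector<float> data; };      // stand-in for ggml_tensor

    typedef void (*op_t)(const tensor * src0, const tensor * src1, tensor * dst,
                         const float * src0_dd, const float * src1_dd, float * dst_dd);

    static void op_scale(const tensor * src0, const tensor * src1, tensor * dst,
                         const float * src0_dd, const float * src1_dd, float * dst_dd) {
        const float scale = src1_dd[0];
        for (size_t i = 0; i < src0->data.size(); ++i) {
            dst_dd[i] = scale*src0_dd[i];
        }
        (void) src1; (void) dst;
    }

    static void op_flatten(const tensor * src0, const tensor * src1, tensor * dst, op_t op) {
        // the real helper would allocate device buffers here when a tensor lives on the host...
        op(src0, src1, dst, src0->data.data(), src1->data.data(), dst->data.data());
        // ...and copy dst back to the host afterwards
    }

    int main() {
        tensor a{{1, 2, 3}}, s{{2}}, d{{0, 0, 0}};
        op_flatten(&a, &s, &d, op_scale);
        printf("%g %g %g\n", d.data[0], d.data[1], d.data[2]);   // prints: 2 4 6
        return 0;
    }
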
6267
+
6268
+ void ggml_cuda_set_peer_access(const int n_tokens) {
6269
+ static bool peer_access_enabled = false;
6270
+
6271
+ const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
6272
+
6273
+ if (peer_access_enabled == enable_peer_access) {
6274
+ return;
6275
+ }
6276
+
6277
+ #ifdef NDEBUG
6278
+ for (int id = 0; id < g_device_count; ++id) {
6279
+ CUDA_CHECK(ggml_cuda_set_device(id));
6280
+
6281
+ for (int id_other = 0; id_other < g_device_count; ++id_other) {
6282
+ if (id == id_other) {
6283
+ continue;
6284
+ }
6285
+ if (id != g_main_device && id_other != g_main_device) {
6286
+ continue;
6287
+ }
6288
+
6289
+ int can_access_peer;
6290
+ CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
6291
+ if (can_access_peer) {
6292
+ if (enable_peer_access) {
6293
+ CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
6294
+ } else {
6295
+ CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
6296
+ }
6297
+ }
6298
+ }
6299
+ }
6300
+ #endif // NDEBUG
6301
+
6302
+ peer_access_enabled = enable_peer_access;
5857
6303
  }
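
ggml_cuda_set_peer_access toggles CUDA peer-to-peer access lazily: only in NDEBUG builds, only for device pairs that include the main device, and only when the desired state (enabled for batches of at most GGML_CUDA_PEER_MAX_BATCH_SIZE tokens) actually changes. The sketch below just walks the same pair selection; the threshold value is not visible in this hunk, so the number used here is a placeholder.

    // Which device pairs the loop above would touch, and in which direction.
    #include <cstdio>

    int main() {
        const int g_device_count = 3;
        const int g_main_device  = 0;
        const int n_tokens       = 32;
        const int peer_max_batch = 128;    // placeholder for GGML_CUDA_PEER_MAX_BATCH_SIZE

        const bool enable = n_tokens <= peer_max_batch;
        for (int id = 0; id < g_device_count; ++id) {
            for (int id_other = 0; id_other < g_device_count; ++id_other) {
                if (id == id_other)                                    continue;
                if (id != g_main_device && id_other != g_main_device)  continue;
                printf("%s peer access %d -> %d\n", enable ? "enable" : "disable", id, id_other);
            }
        }
        return 0;
    }
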
5858
6304
 
5859
- static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5860
- ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
6305
+ static void ggml_cuda_op_mul_mat(
6306
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
6307
+ const bool convert_src1_to_q8_1) {
6308
+
5861
6309
  const int64_t ne00 = src0->ne[0];
5862
6310
  const int64_t ne01 = src0->ne[1];
5863
6311
  const int64_t ne02 = src0->ne[2];
5864
6312
  const int64_t ne03 = src0->ne[3];
5865
6313
  const int64_t nrows0 = ggml_nrows(src0);
5866
6314
 
5867
- const bool use_src1 = src1 != nullptr;
5868
- const int64_t ne10 = use_src1 ? src1->ne[0] : 1;
5869
- const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
5870
- const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
5871
- const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
5872
- const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
6315
+ const int64_t ne10 = src1->ne[0];
6316
+ const int64_t ne11 = src1->ne[1];
6317
+ const int64_t ne12 = src1->ne[2];
6318
+ const int64_t ne13 = src1->ne[3];
6319
+ const int64_t nrows1 = ggml_nrows(src1);
5873
6320
 
5874
6321
  GGML_ASSERT(ne03 == ne13);
5875
6322
 
5876
6323
  const int64_t ne0 = dst->ne[0];
5877
6324
  const int64_t ne1 = dst->ne[1];
5878
6325
 
5879
- const int nb2 = dst->nb[2];
5880
- const int nb3 = dst->nb[3];
6326
+ const int nb2 = dst->nb[2];
6327
+ const int nb3 = dst->nb[3];
6328
+
6329
+ ggml_cuda_set_peer_access(ne11);
5881
6330
 
5882
6331
  GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
5883
- GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6332
+ GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
5884
6333
 
5885
- // strides for iteration over dims 3 and 2
5886
- const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
5887
- const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
5888
- const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
5889
- const int64_t src0_stride = ne00 * ne01 * stride_mod;
5890
- const int64_t src1_stride = ne10 * ne11 * stride_mod;
5891
- const int64_t dst_stride = ne0 * ne1 * stride_mod;
6334
+ GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
5892
6335
 
5893
- const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
5894
- const int64_t i03_max = flatten_rows ? 1 : ne03;
5895
- const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
5896
- const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
5897
- GGML_ASSERT(!(flatten_rows && ne02 < ne12));
6336
+ const int64_t i02_divisor = ne12 / ne02;
5898
6337
 
5899
6338
  const size_t src0_ts = ggml_type_size(src0->type);
5900
6339
  const size_t src0_bs = ggml_blck_size(src0->type);
6340
+ const size_t q8_1_ts = sizeof(block_q8_1);
6341
+ const size_t q8_1_bs = QK8_1;
5901
6342
 
5902
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
5903
- struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
5904
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6343
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6344
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6345
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
5905
6346
 
5906
6347
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
5907
6348
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
5908
- const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
5909
6349
 
5910
- const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
5911
- const bool src1_stays_on_host = use_src1 && (
5912
- dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
6350
+ const bool src1_is_contiguous = ggml_is_contiguous(src1);
6351
+ const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ?
6352
+ ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
5913
6353
 
5914
6354
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
6355
+ GGML_ASSERT(!(split && ne02 > 1));
6356
+ GGML_ASSERT(!(split && ne03 > 1));
5915
6357
  GGML_ASSERT(!(split && ne02 < ne12));
5916
6358
 
5917
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
5918
-
5919
6359
  // dd = data device
5920
- char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
5921
- float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
5922
- float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
5923
- float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
5924
-
5925
- // asq = actual size quantized, asf = actual size float
5926
- size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
5927
- size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
5928
- size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
5929
- size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
6360
+ char * src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
6361
+ float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
6362
+ char * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
6363
+ float * dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
5930
6364
 
5931
- // if multiple devices are used they need to wait for the main device
5932
- // here an event is recorded that signifies that the main device has finished calculating the input data
5933
- if (split && g_device_count > 1) {
5934
- CUDA_CHECK(cudaSetDevice(g_main_device));
5935
- CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
5936
- }
6365
+ // as = actual size
6366
+ size_t src0_as[GGML_CUDA_MAX_DEVICES] = {0};
6367
+ size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
6368
+ size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
6369
+ size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0};
5937
6370
 
5938
- for (int id = 0; id < g_device_count; ++id) {
5939
- if (!split && id != g_main_device) {
5940
- continue;
5941
- }
6371
+ int64_t row_low[GGML_CUDA_MAX_DEVICES];
6372
+ int64_t row_high[GGML_CUDA_MAX_DEVICES];
5942
6373
 
5943
- const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device;
5944
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
6374
+ for (int64_t id = 0; id < g_device_count; ++id) {
6375
+ // by default, use all rows
6376
+ row_low[id] = 0;
6377
+ row_high[id] = ne01;
5945
6378
 
5946
- int64_t row_low, row_high;
6379
+ // for multi GPU, get the row boundaries from tensor split
6380
+ // and round to mul_mat_q tile sizes
5947
6381
  if (split) {
5948
6382
  const int64_t rounding = get_row_rounding(src0->type);
5949
6383
 
5950
- row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
5951
- row_low -= row_low % rounding;
6384
+ if (id != 0) {
6385
+ row_low[id] = ne01*g_tensor_split[id];
6386
+ row_low[id] -= row_low[id] % rounding;
6387
+ }
5952
6388
 
5953
- if (id == g_device_count - 1) {
5954
- row_high = nrows0;
5955
- } else {
5956
- row_high = nrows0*g_tensor_split[id + 1];
5957
- row_high -= row_high % rounding;
6389
+ if (id != g_device_count - 1) {
6390
+ row_high[id] = ne01*g_tensor_split[id + 1];
6391
+ row_high[id] -= row_high[id] % rounding;
5958
6392
  }
5959
- } else {
5960
- row_low = 0;
5961
- row_high = nrows0*i02_divisor;
5962
6393
  }
5963
- if (row_low == row_high) {
6394
+ }
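
The per-device row boundaries are now computed once, up front, into row_low[]/row_high[] instead of being re-derived inside the i03/i02 loop. The runnable sketch below mirrors that computation with made-up split fractions; the real code reads g_tensor_split[] and get_row_rounding(src0->type).

    // Row ranges per GPU for a split tensor, snapped down to the rounding granularity.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t ne01     = 4096;                 // rows of src0 (hypothetical)
        const double  split[]  = {0.0, 0.5, 0.75};     // cumulative fractions per GPU (hypothetical)
        const int     n_dev    = 3;
        const int64_t rounding = 128;                  // would come from get_row_rounding()

        for (int id = 0; id < n_dev; ++id) {
            int64_t row_low  = 0;                      // by default, use all rows
            int64_t row_high = ne01;
            if (id != 0) {
                row_low  = (int64_t) (ne01*split[id]);
                row_low -= row_low % rounding;
            }
            if (id != n_dev - 1) {
                row_high  = (int64_t) (ne01*split[id + 1]);
                row_high -= row_high % rounding;
            }
            printf("GPU %d: rows [%lld, %lld)\n", id, (long long) row_low, (long long) row_high);
        }
        return 0;
    }
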
6395
+
6396
+ for (int64_t id = 0; id < g_device_count; ++id) {
6397
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
5964
6398
  continue;
5965
6399
  }
5966
6400
 
5967
- int64_t row_diff = row_high - row_low;
5968
-
5969
- cudaSetDevice(id);
5970
- cudaStream_t cudaStream_main = g_cudaStreams_main[id];
6401
+ const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
6402
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
5971
6403
 
5972
- // wait for main GPU data if necessary
5973
- if (split && id != g_main_device) {
5974
- CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
5975
- }
6404
+ ggml_cuda_set_device(id);
6405
+ const cudaStream_t stream = g_cudaStreams[id][0];
5976
6406
 
5977
6407
  if (src0_on_device && src0_is_contiguous) {
5978
- if (src0_is_f32) {
5979
- src0_ddf[id] = (float *) src0_extra->data_device[id];
5980
- } else {
5981
- src0_ddq[id] = (char *) src0_extra->data_device[id];
5982
- }
6408
+ src0_dd[id] = (char *) src0_extra->data_device[id];
5983
6409
  } else {
5984
- if (src0_is_f32) {
5985
- src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
5986
- } else {
5987
- src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
5988
- }
6410
+ const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
6411
+ src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
5989
6412
  }
5990
6413
 
5991
- if (src0_needs_f32 && !src0_is_f32) {
5992
- src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
6414
+ if (src1_on_device && src1_is_contiguous) {
6415
+ src1_ddf[id] = (float *) src1_extra->data_device[id];
6416
+ } else {
6417
+ src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
5993
6418
  }
5994
6419
 
5995
- if (use_src1 && !src1_stays_on_host) {
5996
- if (src1_on_device && src1_is_contiguous) {
5997
- src1_ddf[id] = (float *) src1_extra->data_device[id];
5998
- } else {
5999
- src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
6420
+ if (convert_src1_to_q8_1) {
6421
+ src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
6422
+
6423
+ if (split && src1_on_device && src1_is_contiguous) {
6424
+ quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
6425
+ CUDA_CHECK(cudaGetLastError());
6000
6426
  }
6001
6427
  }
6428
+
6002
6429
  if (dst_on_device) {
6003
- dst_ddf[id] = (float *) dst_extra->data_device[id];
6430
+ dst_dd[id] = (float *) dst_extra->data_device[id];
6004
6431
  } else {
6005
- size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float);
6006
- dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
6432
+ const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
6433
+ dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
6007
6434
  }
6435
+ }
6436
+
6437
+ // if multiple devices are used they need to wait for the main device
6438
+ // here an event is recorded that signals that the main device has finished calculating the input data
6439
+ if (split && g_device_count > 1) {
6440
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6441
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
6442
+ }
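
The biggest structural change in the driver follows here: instead of iterating over i03/i02 slices per device, src1 is processed in column chunks. When the weight tensor is split and more than one GPU is in use, the chunk width is MUL_MAT_SRC1_COL_STRIDE and each chunk is assigned to one of MAX_STREAMS streams per device; otherwise a single chunk of ne11 columns is used. The sketch below reproduces only that chunk/stream bookkeeping; the two constants are placeholders, their real values are defined elsewhere in ggml-cuda.cu.

    // Column chunking and stream assignment used by the loop below (split case).
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t ne11 = 300;                      // src1 columns (hypothetical)
        const int64_t MUL_MAT_SRC1_COL_STRIDE = 128;   // placeholder value
        const int64_t MAX_STREAMS             = 8;     // placeholder value

        const int64_t src1_col_stride = MUL_MAT_SRC1_COL_STRIDE;   // ne11 when not split
        for (int64_t col0 = 0; col0 < ne11; col0 += src1_col_stride) {
            const int64_t is    = (col0/src1_col_stride) % MAX_STREAMS;
            const int64_t ncols = col0 + src1_col_stride > ne11 ? ne11 - col0 : src1_col_stride;
            printf("columns [%lld, %lld) on stream %lld\n",
                   (long long) col0, (long long) (col0 + ncols), (long long) is);
        }
        return 0;
    }
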
6008
6443
 
6009
- for (int64_t i03 = 0; i03 < i03_max; i03++) {
6010
- const int64_t i13 = i03 % ne13;
6011
- for (int64_t i02 = 0; i02 < i02_max; i02++) {
6012
- const int64_t i12 = i02 % ne12;
6444
+ const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
6445
+ for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
6446
+ const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
6447
+ const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
6013
6448
 
6014
- const int64_t i0 = i03*i02_max + i02;
6449
+ for (int64_t id = 0; id < g_device_count; ++id) {
6450
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
6451
+ continue;
6452
+ }
6015
6453
 
6016
- // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
6017
- const int64_t i0_offset_low = row_low/rows_per_iter;
6018
- const int64_t i0_offset_high = row_high/rows_per_iter;
6454
+ const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
6455
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
6456
+ const int64_t row_diff = row_high[id] - row_low[id];
6019
6457
 
6020
- int64_t i01_low = 0;
6021
- int64_t i01_high = rows_per_iter;
6022
- if (split) {
6023
- if (i0 < i0_offset_low || i0 > i0_offset_high) {
6024
- continue;
6025
- }
6026
- if (i0 == i0_offset_low) {
6027
- i01_low = row_low % rows_per_iter;
6028
- }
6029
- if (i0 == i0_offset_high) {
6030
- i01_high = row_high % rows_per_iter;
6031
- }
6032
- }
6458
+ ggml_cuda_set_device(id);
6459
+ const cudaStream_t stream = g_cudaStreams[id][is];
6460
+
6461
+ // wait for main GPU data if necessary
6462
+ if (split && (id != g_main_device || is != 0)) {
6463
+ CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
6464
+ }
6033
6465
 
6034
- // There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or optimizing out local variables.
6035
- // Removing the first assert or changing the order of the arguments causes the second assert to fail.
6036
- // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
6037
- // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
6038
- GGML_ASSERT(i01_low == 0 || g_device_count > 1);
6039
- GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
6466
+ for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
6467
+ const int64_t i03 = i0 / ne12;
6468
+ const int64_t i02 = i0 % ne12;
6040
6469
 
6041
- const int64_t i01_diff = i01_high - i01_low;
6042
- if (i01_diff == 0) {
6043
- continue;
6044
- }
6045
- const int64_t i11 = i13*ne12 + i12;
6470
+ const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
6046
6471
 
6047
6472
  // for split tensors the data begins at i0 == i0_offset_low
6048
- char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
6049
- float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
6050
- float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
6051
- float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
6052
-
6053
- // for split tensors the data pointer needs to be rounded down
6054
- // to the bin edge for i03, i02 bins beyond the first
6055
- if (i0 - i0_offset_low > 0) {
6056
- GGML_ASSERT(!flatten_rows);
6057
- src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
6058
- src0_ddf_i -= (row_low % ne01)*ne00;
6059
- dst_ddf_i -= (row_low % ne0)*ne1;
6060
- }
6473
+ char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
6474
+ float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
6475
+ char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
6476
+ float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
6061
6477
 
6062
6478
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
6063
6479
  // in that case an offset on dst_ddf_i is needed
6064
6480
  if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
6065
- dst_ddf_i += i01_low; // offset is 0 if no tensor split
6481
+ dst_dd_i += row_low[id]; // offset is 0 if no tensor split
6066
6482
  }
6067
6483
 
6068
6484
  // copy src0, src1 to device if necessary
6069
- if (use_src1 && !src1_stays_on_host) {
6070
- if (src1->backend == GGML_BACKEND_CPU) {
6071
- GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
6072
- int64_t nrows1 = flatten_rows ? nrows0 : ne11;
6073
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
6074
- } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
6075
- if (id != g_main_device) {
6076
- GGML_ASSERT(!flatten_rows);
6485
+ if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
6486
+ if (id != g_main_device) {
6487
+ if (convert_src1_to_q8_1) {
6488
+ char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
6489
+ CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs,
6490
+ cudaMemcpyDeviceToDevice, stream));
6491
+ } else {
6077
6492
  float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
6078
- src1_ddf_i_source += i11*src1_stride;
6079
- CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
6080
- cudaMemcpyDeviceToDevice, cudaStream_main));
6493
+ src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
6494
+ CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float),
6495
+ cudaMemcpyDeviceToDevice, stream));
6081
6496
  }
6082
- } else if (src1_on_device && !src1_is_contiguous) {
6083
- GGML_ASSERT(!split);
6084
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
6085
- } else {
6086
- GGML_ASSERT(false);
6087
6497
  }
6498
+ } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
6499
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
6500
+ src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
6501
+ } else {
6502
+ GGML_ASSERT(false);
6088
6503
  }
6089
6504
 
6090
- if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
6091
- if (src0_is_f32) {
6092
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
6093
- } else {
6094
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
6095
- }
6505
+ if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
6506
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
6507
+ CUDA_CHECK(cudaGetLastError());
6096
6508
  }
6097
6509
 
6098
- // convert src0 to f32 if it is necessary for the ggml_cuda_op
6099
- if (src0_needs_f32 && !src0_is_f32) {
6100
- to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
6101
- CUDA_CHECK(cudaGetLastError());
6510
+ if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
6511
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
6102
6512
  }
6103
6513
 
6104
6514
  // do the computation
6105
- op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
6515
+ op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
6516
+ row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
6106
6517
  CUDA_CHECK(cudaGetLastError());
6107
6518
 
6108
6519
  // copy dst to host or other device if necessary
@@ -6124,95 +6535,86 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
  // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
  // Instead they need to be copied to the correct slice in ne0 = dst row index.
  // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
- float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
- CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
- i01_diff*sizeof(float), ne1, kind, cudaStream_main));
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+ GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
+ dhf_dst_i += src1_col_0*ne0 + row_low[id];
+ CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float),
+ row_diff*sizeof(float), src1_ncols, kind, stream));
  } else {
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
- CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
+ GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
+ dhf_dst_i += src1_col_0*ne0;
+ CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
  }
  }

- // signify to main device that other device is done
- if (split && g_device_count > 1 && id != g_main_device) {
- CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
+ // add event for the main device to wait on until other device is done
+ if (split && (id != g_main_device || is != 0)) {
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
  }
  }
  }
  }

- // wait until each device is finished, then free their buffers
- for (int id = 0; id < g_device_count; ++id) {
- if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
- continue;
- }
-
- CUDA_CHECK(cudaSetDevice(id));
+ for (int64_t id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(ggml_cuda_set_device(id));

- if (src0_asq[id] > 0) {
- ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
- }
- if (src0_asf[id] > 0) {
- ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
+ // free buffers again when done
+ if (src0_as[id] > 0) {
+ ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
  }
  if (src1_asf[id] > 0) {
  ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
  }
- if (dst_asf[id] > 0) {
- ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
+ if (src1_asq[id] > 0) {
+ ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
+ }
+ if (dst_as[id] > 0) {
+ ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
  }
  }

  // main device waits for all other devices to be finished
  if (split && g_device_count > 1) {
- CUDA_CHECK(cudaSetDevice(g_main_device));
- for (int id = 0; id < g_device_count; ++id) {
- if (id != g_main_device && src0_extra->events[id]) {
- CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
+ int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
+ is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
+
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ for (int64_t id = 0; id < g_device_count; ++id) {
+ for (int64_t is = 0; is < is_max; ++is) {
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
  }
  }
  }

  if (dst->backend == GGML_BACKEND_CPU) {
- CUDA_CHECK(cudaSetDevice(g_main_device));
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  CUDA_CHECK(cudaDeviceSynchronize());
  }
  }

  void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
- // Due to flatten_rows == true this does in practice not make a difference however.
- // Better solution would be nice but right now that would require disproportionate changes.
- GGML_ASSERT(
- (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
- src1->type == GGML_TYPE_F32 &&
- (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
  }

  void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
  }

  void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
  }

  void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
  }

  void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
  }

  void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
  }

  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
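The refactored multi-GPU path in the hunk above records one event per (device, stream) pair and then has the main device's primary stream wait on all of them before the result is consumed. A minimal sketch of that record/wait handshake on a single device, with names chosen purely for illustration:

    // sketch: make one stream wait for work queued on another via an event
    #include <cuda_runtime.h>

    int main() {
        cudaStream_t worker, consumer;
        cudaEvent_t done;

        cudaStreamCreateWithFlags(&worker,   cudaStreamNonBlocking);
        cudaStreamCreateWithFlags(&consumer, cudaStreamNonBlocking);
        cudaEventCreateWithFlags(&done, cudaEventDisableTiming);  // timing not needed, cheaper to record

        // ... enqueue kernels / copies on `worker` here ...

        cudaEventRecord(done, worker);           // marks the point the consumer must wait for
        cudaStreamWaitEvent(consumer, done, 0);  // consumer stalls until `done` has fired on `worker`

        // ... enqueue work that depends on the worker's results on `consumer` ...

        cudaStreamSynchronize(consumer);
        cudaEventDestroy(done);
        cudaStreamDestroy(worker);
        cudaStreamDestroy(consumer);
        return 0;
    }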
@@ -6246,8 +6648,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr

  const int64_t ne12 = src1->ne[2];

- CUDA_CHECK(cudaSetDevice(g_main_device));
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
  void * src0_ddq = src0_extra->data_device[g_main_device];
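This and the following hunks replace the single per-device `cudaStream_main` with the first entry of an indexed stream table (`g_cudaStreams[device][stream]`). A minimal sketch of setting up such a table of non-blocking streams; the MAX_DEVICES/MAX_STREAMS limits and array shape here are illustrative, not the library's values:

    // sketch: a small per-device table of non-blocking streams
    #include <cuda_runtime.h>

    #define MAX_DEVICES 2
    #define MAX_STREAMS 8

    static cudaStream_t streams[MAX_DEVICES][MAX_STREAMS];

    int main() {
        int device_count = 0;
        cudaGetDeviceCount(&device_count);
        if (device_count > MAX_DEVICES) device_count = MAX_DEVICES;

        for (int id = 0; id < device_count; ++id) {
            cudaSetDevice(id);
            for (int is = 0; is < MAX_STREAMS; ++is) {
                // non-blocking streams do not implicitly synchronize with the default stream
                cudaStreamCreateWithFlags(&streams[id][is], cudaStreamNonBlocking);
            }
        }

        // ... queue independent blocks of work on streams[id][is] ...

        for (int id = 0; id < device_count; ++id) {
            cudaSetDevice(id);
            for (int is = 0; is < MAX_STREAMS; ++is) {
                cudaStreamDestroy(streams[id][is]);
            }
        }
        return 0;
    }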
@@ -6258,7 +6660,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

- ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
+ ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
  }

  void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -6277,8 +6679,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
  const int64_t nb01 = src0->nb[1];
  const int64_t nb02 = src0->nb[2];

- CUDA_CHECK(cudaSetDevice(g_main_device));
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -6289,38 +6691,49 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

- const int row_stride_x = nb01 / sizeof(half);
- const int channel_stride_x = nb02 / sizeof(half);
+ const int64_t row_stride_x = nb01 / sizeof(half);
+ const int64_t channel_stride_x = nb02 / sizeof(half);

- ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
+ ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
  }

  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
  src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;

+ int64_t min_compute_capability = INT_MAX;
+ for (int64_t id = 0; id < g_device_count; ++id) {
+ if (min_compute_capability > g_compute_capabilities[id]
+ && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+ min_compute_capability = g_compute_capabilities[id];
+ }
+ }
+
  if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
  } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
  }else if (src0->type == GGML_TYPE_F32) {
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
- } else {
- int min_compute_capability = INT_MAX;
- for (int id = 0; id < g_device_count; ++id) {
- if (min_compute_capability > g_compute_capabilities[id]
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
- min_compute_capability = g_compute_capabilities[id];
- }
- }

+ #ifdef GGML_CUDA_FORCE_DMMV
+ const bool use_mul_mat_vec_q = false;
+ #else
+ const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+ #endif // GGML_CUDA_FORCE_DMMV
+
+ if (use_mul_mat_vec_q) {
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+ } else {
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+ }
+ } else {
  if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
  } else {
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
  }
  }
  } else {
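The dispatch above keys off the lowest compute capability among the devices that actually receive a slice of the tensor split, with MIN_CC_DP4A (compute capability 6.1) gating the integer dot-product kernels. A minimal sketch of deriving such a per-device number with the CUDA runtime; the major*100 + minor*10 encoding is an illustrative assumption, not necessarily how the library stores it:

    // sketch: collect per-device compute capabilities and take the minimum
    #include <cuda_runtime.h>
    #include <climits>
    #include <cstdio>

    #define MIN_CC_DP4A 610  // __dp4a requires compute capability >= 6.1

    int main() {
        int device_count = 0;
        cudaGetDeviceCount(&device_count);

        int min_cc = INT_MAX;
        for (int id = 0; id < device_count; ++id) {
            cudaDeviceProp prop;
            cudaGetDeviceProperties(&prop, id);
            const int cc = 100*prop.major + 10*prop.minor;  // e.g. 6.1 -> 610
            if (cc < min_cc) {
                min_cc = cc;
            }
        }

        printf("min compute capability: %d, DP4A kernels usable: %s\n",
               min_cc, min_cc >= MIN_CC_DP4A ? "yes" : "no");
        return 0;
    }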
@@ -6329,8 +6742,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
  }

  void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
  }

  void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6359,8 +6771,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
  const int64_t nb11 = src1->nb[1];
  const int64_t nb12 = src1->nb[2];

- CUDA_CHECK(cudaSetDevice(g_main_device));
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

  const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
  const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -6370,10 +6782,10 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens

  if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
  ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
- ne10, ne11, nb10, nb11, nb12, cudaStream_main);
+ ne10, ne11, nb10, nb11, nb12, main_stream);
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
  ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
- ne10, ne11, nb10, nb11, nb12, cudaStream_main);
+ ne10, ne11, nb10, nb11, nb12, main_stream);
  } else {
  GGML_ASSERT(false);
  }
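ggml_cpy_f32_f16_cuda above copies a float tensor into half-precision storage on the device. A minimal sketch of the core conversion for the contiguous case; the kernel name and launch shape are illustrative, and the real helper additionally handles arbitrary strides:

    // sketch: contiguous f32 -> f16 copy kernel
    #include <cuda_runtime.h>
    #include <cuda_fp16.h>

    __global__ void cpy_f32_f16(const float * x, half * y, const int k) {
        const int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i >= k) {
            return;
        }
        y[i] = __float2half(x[i]);  // per-element round-to-nearest conversion
    }

    static void cpy_f32_f16_sketch(const float * x, half * y, const int k, cudaStream_t stream) {
        const int block_size = 256;
        const int num_blocks = (k + block_size - 1) / block_size;
        cpy_f32_f16<<<num_blocks, block_size, 0, stream>>>(x, y, k);
    }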
@@ -6387,28 +6799,20 @@ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
  }

  void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
  }

  void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
  }

  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
  GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
-
- const int mode = ((int32_t *) dst->op_params)[2];
- const bool is_glm = mode & 4;
-
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
  }

  void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
  }

  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6418,7 +6822,7 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
  }

  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
- int nrows = ggml_nrows(tensor);
+ const int64_t nrows = ggml_nrows(tensor);

  const int64_t ne0 = tensor->ne[0];

@@ -6428,14 +6832,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
  memset(extra, 0, sizeof(*extra));

- for (int id = 0; id < g_device_count; ++id) {
+ for (int64_t id = 0; id < g_device_count; ++id) {
  if (backend == GGML_BACKEND_GPU && id != g_main_device) {
  continue;
  }

- cudaSetDevice(id);
+ ggml_cuda_set_device(id);

- int row_low, row_high;
+ int64_t row_low, row_high;
  if (backend == GGML_BACKEND_GPU) {
  row_low = 0;
  row_high = nrows;
@@ -6485,7 +6889,9 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  extra->data_device[id] = buf;

  if (backend == GGML_BACKEND_GPU_SPLIT) {
- CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+ CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
+ }
  }
  }

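With one stream per column block, each split tensor now carries a MAX_STREAMS-wide row of events per device instead of a single event, created here and destroyed in the ggml_cuda_free_data hunk below. A minimal sketch of that event-table lifecycle; the MAX_DEVICES/MAX_STREAMS sizes are illustrative only:

    // sketch: per-(device, stream) event table, created up front and torn down on free
    #include <cuda_runtime.h>

    #define MAX_DEVICES 2
    #define MAX_STREAMS 8

    static cudaEvent_t events[MAX_DEVICES][MAX_STREAMS] = {};  // zero-initialized -> nullptr

    static void create_events(int device_count) {
        for (int id = 0; id < device_count && id < MAX_DEVICES; ++id) {
            cudaSetDevice(id);
            for (int is = 0; is < MAX_STREAMS; ++is) {
                // disable timing: these events only order work, they are never timed
                cudaEventCreateWithFlags(&events[id][is], cudaEventDisableTiming);
            }
        }
    }

    static void destroy_events(int device_count) {
        for (int id = 0; id < device_count && id < MAX_DEVICES; ++id) {
            for (int is = 0; is < MAX_STREAMS; ++is) {
                if (events[id][is] != nullptr) {
                    cudaSetDevice(id);
                    cudaEventDestroy(events[id][is]);
                    events[id][is] = nullptr;
                }
            }
        }
    }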
@@ -6499,15 +6905,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {

  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

- for (int id = 0; id < g_device_count; ++id) {
+ for (int64_t id = 0; id < g_device_count; ++id) {
  if (extra->data_device[id] != nullptr) {
- CUDA_CHECK(cudaSetDevice(id));
+ CUDA_CHECK(ggml_cuda_set_device(id));
  CUDA_CHECK(cudaFree(extra->data_device[id]));
  }

- if (extra->events[id] != nullptr) {
- CUDA_CHECK(cudaSetDevice(id));
- CUDA_CHECK(cudaEventDestroy(extra->events[id]));
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+ if (extra->events[id][is] != nullptr) {
+ CUDA_CHECK(ggml_cuda_set_device(id));
+ CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
+ }
  }
  }

@@ -6559,7 +6967,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  force_inplace;
  const size_t size = ggml_nbytes(tensor);

- CUDA_CHECK(cudaSetDevice(g_main_device));
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
@@ -6608,6 +7016,7 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
  return;
  }
  if (g_scratch_buffer == nullptr) {
+ ggml_cuda_set_device(g_main_device);
  CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
  }

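The added ggml_cuda_set_device call makes the lazily created scratch buffer land on the main device rather than on whichever device happens to be current when the allocation is triggered. A minimal sketch of that lazy, device-pinned allocation pattern (names and sizes are illustrative):

    // sketch: allocate a shared scratch buffer once, pinned to a chosen device
    #include <cuda_runtime.h>
    #include <cstddef>

    static void * g_scratch = nullptr;
    static size_t g_scratch_bytes = 64u << 20;  // illustrative 64 MiB
    static int    g_main_dev = 0;

    static void * get_scratch() {
        if (g_scratch == nullptr) {
            // select the owning device first; cudaMalloc allocates on the *current* device
            cudaSetDevice(g_main_dev);
            cudaMalloc(&g_scratch, g_scratch_bytes);
        }
        return g_scratch;
    }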
@@ -6647,7 +7056,7 @@ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
  ggml_cuda_assign_buffers_impl(tensor, false, true, false);
  }

- void ggml_cuda_set_main_device(int main_device) {
+ void ggml_cuda_set_main_device(const int main_device) {
  if (main_device >= g_device_count) {
  fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
  main_device, g_device_count, g_main_device);
@@ -6661,11 +7070,11 @@ void ggml_cuda_set_main_device(int main_device) {
  }
  }

- void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
+ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
  g_mul_mat_q = mul_mat_q;
  }

- void ggml_cuda_set_scratch_size(size_t scratch_size) {
+ void ggml_cuda_set_scratch_size(const size_t scratch_size) {
  g_scratch_size = scratch_size;
  }