llama_cpp 0.5.1 → 0.5.2 (diff of the bundled ggml-cuda.cu)

@@ -13,7 +13,7 @@
13
13
  #ifdef __HIP_PLATFORM_AMD__
14
14
  // for rocblas_initialize()
15
15
  #include "rocblas/rocblas.h"
16
- #endif
16
+ #endif // __HIP_PLATFORM_AMD__
17
17
  #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
18
18
  #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
19
19
  #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
@@ -68,19 +68,29 @@
68
68
  #include <cuda_runtime.h>
69
69
  #include <cublas_v2.h>
70
70
  #include <cuda_fp16.h>
71
- #endif
71
+ #endif // defined(GGML_USE_HIPBLAS)
72
72
 
73
73
  #include "ggml-cuda.h"
74
74
  #include "ggml.h"
75
75
 
76
- #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
77
- #ifndef CC_TURING
78
- #define CC_TURING 700
79
- #endif
76
+ #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
77
+ #define CC_TURING 700
78
+ #define CC_OFFSET_AMD 1000000
79
+ #define CC_RDNA2 CC_OFFSET_AMD + 1030
80
80
 
81
81
  #if defined(GGML_USE_HIPBLAS)
82
82
  #define __CUDA_ARCH__ 1300
83
83
 
84
+ #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
85
+ defined(__gfx1150__) || defined(__gfx1151__)
86
+ #define RDNA3
87
+ #endif
88
+
89
+ #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
90
+ defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
91
+ #define RDNA2
92
+ #endif
93
+
84
94
  #ifndef __has_builtin
85
95
  #define __has_builtin(x) 0
86
96
  #endif
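Editor's note: the hunk above puts NVIDIA and AMD devices on a single numeric scale for kernel selection — NVIDIA cards keep their usual compute capability, while AMD cards are shifted above `CC_OFFSET_AMD`, so one comparison chain can distinguish RDNA2/RDNA3, older AMD, Turing/Ampere, and Pascal. A minimal standalone sketch of that mapping, using only the constants added in this diff (the helper function and the sample values in `main` are illustrative, not part of ggml-cuda.cu):

```cpp
// Sketch: how the new constants partition one integer "compute capability" axis.
#include <cstdio>

#define MIN_CC_DP4A   610                       // minimum cc for __dp4a
#define CC_TURING     700
#define CC_OFFSET_AMD 1000000
#define CC_RDNA2      (CC_OFFSET_AMD + 1030)    // gfx103x reports major 10, minor 3

static int effective_cc(int major, int minor, bool is_amd) {
    // NVIDIA: 100*major + 10*minor; AMD: the same, shifted above every NVIDIA value
    return 100*major + 10*minor + (is_amd ? CC_OFFSET_AMD : 0);
}

int main() {
    printf("cc 7.5 (Turing):  %d\n", effective_cc(7, 5, false));  // 750     >= CC_TURING
    printf("gfx1030 (10.3):   %d\n", effective_cc(10, 3, true));  // 1001030 >= CC_RDNA2
    printf("gfx900  (9.0):    %d\n", effective_cc(9, 0, true));   // 1000900 -> >= CC_OFFSET_AMD,
                                                                  //            takes the RDNA1 fallback path
    return 0;
}
```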
@@ -132,7 +142,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
132
142
  #endif
133
143
  return c;
134
144
  }
135
- #endif
145
+ #endif // defined(GGML_USE_HIPBLAS)
136
146
 
137
147
  #if defined(_MSC_VER)
138
148
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -144,8 +154,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
144
154
  do { \
145
155
  cudaError_t err_ = (err); \
146
156
  if (err_ != cudaSuccess) { \
147
- fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
157
+ int id; \
158
+ cudaGetDevice(&id); \
159
+ fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
148
160
  cudaGetErrorString(err_)); \
161
+ fprintf(stderr, "current device: %d\n", id); \
149
162
  exit(1); \
150
163
  } \
151
164
  } while (0)
@@ -155,8 +168,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
155
168
  do { \
156
169
  cublasStatus_t err_ = (err); \
157
170
  if (err_ != CUBLAS_STATUS_SUCCESS) { \
171
+ int id; \
172
+ cudaGetDevice(&id); \
158
173
  fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
159
174
  err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
175
+ fprintf(stderr, "current device: %d\n", id); \
160
176
  exit(1); \
161
177
  } \
162
178
  } while (0)
@@ -165,7 +181,10 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
165
181
  do { \
166
182
  cublasStatus_t err_ = (err); \
167
183
  if (err_ != CUBLAS_STATUS_SUCCESS) { \
184
+ int id; \
185
+ cudaGetDevice(&id); \
168
186
  fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
187
+ fprintf(stderr, "current device: %d\n", id); \
169
188
  exit(1); \
170
189
  } \
171
190
  } while (0)
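Editor's note: all three error-checking macros above now fetch the active device before aborting, which matters once work is spread across several GPUs and streams. A condensed, self-contained version of the pattern (simplified from the diff; the real macros live in ggml-cuda.cu):

```cpp
// Sketch of the updated CUDA_CHECK pattern: report the failing call *and* the
// device that was current when it failed.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(err)                                                       \
    do {                                                                      \
        cudaError_t err_ = (err);                                             \
        if (err_ != cudaSuccess) {                                            \
            int id;                                                           \
            cudaGetDevice(&id);                                               \
            fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n",                 \
                    err_, __FILE__, __LINE__, cudaGetErrorString(err_));      \
            fprintf(stderr, "current device: %d\n", id);                      \
            exit(1);                                                          \
        }                                                                     \
    } while (0)
```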
@@ -212,10 +231,13 @@ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__
212
231
  typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
213
232
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
214
233
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
215
- typedef void (*ggml_cuda_op_t)(
216
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
217
- float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
218
- cudaStream_t & cudaStream_main);
234
+ typedef void (*ggml_cuda_op_mul_mat_t)(
235
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
236
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
237
+ const int64_t src1_padded_row_size, const cudaStream_t & stream);
238
+ typedef void (*ggml_cuda_op_flatten_t)(
239
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
240
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream);
219
241
 
220
242
  // QK = number of values after dequantization
221
243
  // QR = QK / number of values before dequantization
@@ -396,11 +418,29 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
396
418
  static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
397
419
  #endif
398
420
 
421
+ #define MUL_MAT_SRC1_COL_STRIDE 128
422
+
423
+ #define MAX_STREAMS 8
424
+ static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
425
+
399
426
  struct ggml_tensor_extra_gpu {
400
427
  void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
401
- cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
428
+ cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
402
429
  };
403
430
 
431
+ // this is faster on Windows
432
+ // probably because the Windows CUDA libraries forget to make this check before invoking the drivers
433
+ inline cudaError_t ggml_cuda_set_device(const int device) {
434
+ int current_device;
435
+ CUDA_CHECK(cudaGetDevice(&current_device));
436
+
437
+ if (device == current_device) {
438
+ return cudaSuccess;
439
+ }
440
+
441
+ return cudaSetDevice(device);
442
+ }
443
+
404
444
  static int g_device_count = -1;
405
445
  static int g_main_device = 0;
406
446
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
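Editor's note: the hunk above replaces the single main stream per device with a pool of `MAX_STREAMS` streams per device and adds `ggml_cuda_set_device`, which skips `cudaSetDevice` when the device is already current. A short sketch of how the two fit together, assuming the declarations added above (`MAX_STREAMS`, `g_cudaStreams`, `ggml_cuda_set_device`, `CUDA_CHECK`); it mirrors the loop `ggml_init_cublas` gains later in this diff, and the wrapper function itself is hypothetical:

```cpp
// Create the per-device stream pools, switching devices only when necessary.
static void create_stream_pools(int device_count) {
    for (int id = 0; id < device_count; ++id) {
        // no-op when the device is already current (the Windows fast path noted above)
        CUDA_CHECK(ggml_cuda_set_device(id));
        for (int is = 0; is < MAX_STREAMS; ++is) {
            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is],
                                                 cudaStreamNonBlocking));
        }
    }
}
```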
@@ -413,8 +453,6 @@ static size_t g_scratch_offset = 0;
413
453
 
414
454
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
415
455
 
416
- static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
417
-
418
456
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
419
457
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
420
458
 
@@ -3444,6 +3482,12 @@ static __device__ __forceinline__ void mul_mat_q(
3444
3482
  }
3445
3483
  }
3446
3484
 
3485
+ #define MMQ_X_Q4_0_RDNA2 64
3486
+ #define MMQ_Y_Q4_0_RDNA2 128
3487
+ #define NWARPS_Q4_0_RDNA2 8
3488
+ #define MMQ_X_Q4_0_RDNA1 64
3489
+ #define MMQ_Y_Q4_0_RDNA1 64
3490
+ #define NWARPS_Q4_0_RDNA1 8
3447
3491
  #define MMQ_X_Q4_0_AMPERE 64
3448
3492
  #define MMQ_Y_Q4_0_AMPERE 128
3449
3493
  #define NWARPS_Q4_0_AMPERE 4
@@ -3451,11 +3495,32 @@ static __device__ __forceinline__ void mul_mat_q(
3451
3495
  #define MMQ_Y_Q4_0_PASCAL 64
3452
3496
  #define NWARPS_Q4_0_PASCAL 8
3453
3497
 
3454
- template <bool need_check> static __global__ void mul_mat_q4_0(
3498
+ template <bool need_check> static __global__ void
3499
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3500
+ #if defined(RDNA3) || defined(RDNA2)
3501
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
3502
+ #endif // defined(RDNA3) || defined(RDNA2)
3503
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3504
+ mul_mat_q4_0(
3455
3505
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3456
3506
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3457
3507
 
3458
- #if __CUDA_ARCH__ >= CC_TURING
3508
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3509
+ #if defined(RDNA3) || defined(RDNA2)
3510
+ const int mmq_x = MMQ_X_Q4_0_RDNA2;
3511
+ const int mmq_y = MMQ_Y_Q4_0_RDNA2;
3512
+ const int nwarps = NWARPS_Q4_0_RDNA2;
3513
+ #else
3514
+ const int mmq_x = MMQ_X_Q4_0_RDNA1;
3515
+ const int mmq_y = MMQ_Y_Q4_0_RDNA1;
3516
+ const int nwarps = NWARPS_Q4_0_RDNA1;
3517
+ #endif // defined(RDNA3) || defined(RDNA2)
3518
+
3519
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3520
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3521
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3522
+
3523
+ #elif __CUDA_ARCH__ >= CC_TURING
3459
3524
  const int mmq_x = MMQ_X_Q4_0_AMPERE;
3460
3525
  const int mmq_y = MMQ_Y_Q4_0_AMPERE;
3461
3526
  const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3478,6 +3543,12 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
3478
3543
  #endif // __CUDA_ARCH__ >= CC_TURING
3479
3544
  }
3480
3545
 
3546
+ #define MMQ_X_Q4_1_RDNA2 64
3547
+ #define MMQ_Y_Q4_1_RDNA2 128
3548
+ #define NWARPS_Q4_1_RDNA2 8
3549
+ #define MMQ_X_Q4_1_RDNA1 64
3550
+ #define MMQ_Y_Q4_1_RDNA1 64
3551
+ #define NWARPS_Q4_1_RDNA1 8
3481
3552
  #define MMQ_X_Q4_1_AMPERE 64
3482
3553
  #define MMQ_Y_Q4_1_AMPERE 128
3483
3554
  #define NWARPS_Q4_1_AMPERE 4
@@ -3486,14 +3557,33 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
3486
3557
  #define NWARPS_Q4_1_PASCAL 8
3487
3558
 
3488
3559
  template <bool need_check> static __global__ void
3489
- #if __CUDA_ARCH__ < CC_TURING
3560
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3561
+ #if defined(RDNA3) || defined(RDNA2)
3562
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
3563
+ #endif // defined(RDNA3) || defined(RDNA2)
3564
+ #elif __CUDA_ARCH__ < CC_TURING
3490
3565
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
3491
3566
  #endif // __CUDA_ARCH__ < CC_TURING
3492
3567
  mul_mat_q4_1(
3493
3568
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3494
3569
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3495
3570
 
3496
- #if __CUDA_ARCH__ >= CC_TURING
3571
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3572
+ #if defined(RDNA3) || defined(RDNA2)
3573
+ const int mmq_x = MMQ_X_Q4_1_RDNA2;
3574
+ const int mmq_y = MMQ_Y_Q4_1_RDNA2;
3575
+ const int nwarps = NWARPS_Q4_1_RDNA2;
3576
+ #else
3577
+ const int mmq_x = MMQ_X_Q4_1_RDNA1;
3578
+ const int mmq_y = MMQ_Y_Q4_1_RDNA1;
3579
+ const int nwarps = NWARPS_Q4_1_RDNA1;
3580
+ #endif // defined(RDNA3) || defined(RDNA2)
3581
+
3582
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
3583
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3584
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3585
+
3586
+ #elif __CUDA_ARCH__ >= CC_TURING
3497
3587
  const int mmq_x = MMQ_X_Q4_1_AMPERE;
3498
3588
  const int mmq_y = MMQ_Y_Q4_1_AMPERE;
3499
3589
  const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3516,6 +3606,12 @@ template <bool need_check> static __global__ void
3516
3606
  #endif // __CUDA_ARCH__ >= CC_TURING
3517
3607
  }
3518
3608
 
3609
+ #define MMQ_X_Q5_0_RDNA2 64
3610
+ #define MMQ_Y_Q5_0_RDNA2 128
3611
+ #define NWARPS_Q5_0_RDNA2 8
3612
+ #define MMQ_X_Q5_0_RDNA1 64
3613
+ #define MMQ_Y_Q5_0_RDNA1 64
3614
+ #define NWARPS_Q5_0_RDNA1 8
3519
3615
  #define MMQ_X_Q5_0_AMPERE 128
3520
3616
  #define MMQ_Y_Q5_0_AMPERE 64
3521
3617
  #define NWARPS_Q5_0_AMPERE 4
@@ -3523,11 +3619,32 @@ template <bool need_check> static __global__ void
3523
3619
  #define MMQ_Y_Q5_0_PASCAL 64
3524
3620
  #define NWARPS_Q5_0_PASCAL 8
3525
3621
 
3526
- template <bool need_check> static __global__ void mul_mat_q5_0(
3622
+ template <bool need_check> static __global__ void
3623
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3624
+ #if defined(RDNA3) || defined(RDNA2)
3625
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
3626
+ #endif // defined(RDNA3) || defined(RDNA2)
3627
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3628
+ mul_mat_q5_0(
3527
3629
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3528
3630
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3529
3631
 
3530
- #if __CUDA_ARCH__ >= CC_TURING
3632
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3633
+ #if defined(RDNA3) || defined(RDNA2)
3634
+ const int mmq_x = MMQ_X_Q5_0_RDNA2;
3635
+ const int mmq_y = MMQ_Y_Q5_0_RDNA2;
3636
+ const int nwarps = NWARPS_Q5_0_RDNA2;
3637
+ #else
3638
+ const int mmq_x = MMQ_X_Q5_0_RDNA1;
3639
+ const int mmq_y = MMQ_Y_Q5_0_RDNA1;
3640
+ const int nwarps = NWARPS_Q5_0_RDNA1;
3641
+ #endif // defined(RDNA3) || defined(RDNA2)
3642
+
3643
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
3644
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3645
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3646
+
3647
+ #elif __CUDA_ARCH__ >= CC_TURING
3531
3648
  const int mmq_x = MMQ_X_Q5_0_AMPERE;
3532
3649
  const int mmq_y = MMQ_Y_Q5_0_AMPERE;
3533
3650
  const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3550,6 +3667,12 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
3550
3667
  #endif // __CUDA_ARCH__ >= CC_TURING
3551
3668
  }
3552
3669
 
3670
+ #define MMQ_X_Q5_1_RDNA2 64
3671
+ #define MMQ_Y_Q5_1_RDNA2 128
3672
+ #define NWARPS_Q5_1_RDNA2 8
3673
+ #define MMQ_X_Q5_1_RDNA1 64
3674
+ #define MMQ_Y_Q5_1_RDNA1 64
3675
+ #define NWARPS_Q5_1_RDNA1 8
3553
3676
  #define MMQ_X_Q5_1_AMPERE 128
3554
3677
  #define MMQ_Y_Q5_1_AMPERE 64
3555
3678
  #define NWARPS_Q5_1_AMPERE 4
@@ -3557,11 +3680,32 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
3557
3680
  #define MMQ_Y_Q5_1_PASCAL 64
3558
3681
  #define NWARPS_Q5_1_PASCAL 8
3559
3682
 
3560
- template <bool need_check> static __global__ void mul_mat_q5_1(
3683
+ template <bool need_check> static __global__ void
3684
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3685
+ #if defined(RDNA3) || defined(RDNA2)
3686
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
3687
+ #endif // defined(RDNA3) || defined(RDNA2)
3688
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3689
+ mul_mat_q5_1(
3561
3690
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3562
3691
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3563
3692
 
3564
- #if __CUDA_ARCH__ >= CC_TURING
3693
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3694
+ #if defined(RDNA3) || defined(RDNA2)
3695
+ const int mmq_x = MMQ_X_Q5_1_RDNA2;
3696
+ const int mmq_y = MMQ_Y_Q5_1_RDNA2;
3697
+ const int nwarps = NWARPS_Q5_1_RDNA2;
3698
+ #else
3699
+ const int mmq_x = MMQ_X_Q5_1_RDNA1;
3700
+ const int mmq_y = MMQ_Y_Q5_1_RDNA1;
3701
+ const int nwarps = NWARPS_Q5_1_RDNA1;
3702
+ #endif // defined(RDNA3) || defined(RDNA2)
3703
+
3704
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
3705
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3706
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3707
+
3708
+ #elif __CUDA_ARCH__ >= CC_TURING
3565
3709
  const int mmq_x = MMQ_X_Q5_1_AMPERE;
3566
3710
  const int mmq_y = MMQ_Y_Q5_1_AMPERE;
3567
3711
  const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3584,6 +3728,12 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
3584
3728
  #endif // __CUDA_ARCH__ >= CC_TURING
3585
3729
  }
3586
3730
 
3731
+ #define MMQ_X_Q8_0_RDNA2 64
3732
+ #define MMQ_Y_Q8_0_RDNA2 128
3733
+ #define NWARPS_Q8_0_RDNA2 8
3734
+ #define MMQ_X_Q8_0_RDNA1 64
3735
+ #define MMQ_Y_Q8_0_RDNA1 64
3736
+ #define NWARPS_Q8_0_RDNA1 8
3587
3737
  #define MMQ_X_Q8_0_AMPERE 128
3588
3738
  #define MMQ_Y_Q8_0_AMPERE 64
3589
3739
  #define NWARPS_Q8_0_AMPERE 4
@@ -3591,11 +3741,32 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
3591
3741
  #define MMQ_Y_Q8_0_PASCAL 64
3592
3742
  #define NWARPS_Q8_0_PASCAL 8
3593
3743
 
3594
- template <bool need_check> static __global__ void mul_mat_q8_0(
3744
+ template <bool need_check> static __global__ void
3745
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3746
+ #if defined(RDNA3) || defined(RDNA2)
3747
+ __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
3748
+ #endif // defined(RDNA3) || defined(RDNA2)
3749
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3750
+ mul_mat_q8_0(
3595
3751
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3596
3752
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3597
3753
 
3598
- #if __CUDA_ARCH__ >= CC_TURING
3754
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3755
+ #if defined(RDNA3) || defined(RDNA2)
3756
+ const int mmq_x = MMQ_X_Q8_0_RDNA2;
3757
+ const int mmq_y = MMQ_Y_Q8_0_RDNA2;
3758
+ const int nwarps = NWARPS_Q8_0_RDNA2;
3759
+ #else
3760
+ const int mmq_x = MMQ_X_Q8_0_RDNA1;
3761
+ const int mmq_y = MMQ_Y_Q8_0_RDNA1;
3762
+ const int nwarps = NWARPS_Q8_0_RDNA1;
3763
+ #endif // defined(RDNA3) || defined(RDNA2)
3764
+
3765
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
3766
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3767
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3768
+
3769
+ #elif __CUDA_ARCH__ >= CC_TURING
3599
3770
  const int mmq_x = MMQ_X_Q8_0_AMPERE;
3600
3771
  const int mmq_y = MMQ_Y_Q8_0_AMPERE;
3601
3772
  const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3618,6 +3789,12 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
3618
3789
  #endif // __CUDA_ARCH__ >= CC_TURING
3619
3790
  }
3620
3791
 
3792
+ #define MMQ_X_Q2_K_RDNA2 64
3793
+ #define MMQ_Y_Q2_K_RDNA2 128
3794
+ #define NWARPS_Q2_K_RDNA2 8
3795
+ #define MMQ_X_Q2_K_RDNA1 128
3796
+ #define MMQ_Y_Q2_K_RDNA1 32
3797
+ #define NWARPS_Q2_K_RDNA1 8
3621
3798
  #define MMQ_X_Q2_K_AMPERE 64
3622
3799
  #define MMQ_Y_Q2_K_AMPERE 128
3623
3800
  #define NWARPS_Q2_K_AMPERE 4
@@ -3625,11 +3802,32 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
3625
3802
  #define MMQ_Y_Q2_K_PASCAL 64
3626
3803
  #define NWARPS_Q2_K_PASCAL 8
3627
3804
 
3628
- template <bool need_check> static __global__ void mul_mat_q2_K(
3805
+ template <bool need_check> static __global__ void
3806
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3807
+ #if defined(RDNA3) || defined(RDNA2)
3808
+ __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
3809
+ #endif // defined(RDNA3) || defined(RDNA2)
3810
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3811
+ mul_mat_q2_K(
3629
3812
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3630
3813
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3631
3814
 
3632
- #if __CUDA_ARCH__ >= CC_TURING
3815
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3816
+ #if defined(RDNA3) || defined(RDNA2)
3817
+ const int mmq_x = MMQ_X_Q2_K_RDNA2;
3818
+ const int mmq_y = MMQ_Y_Q2_K_RDNA2;
3819
+ const int nwarps = NWARPS_Q2_K_RDNA2;
3820
+ #else
3821
+ const int mmq_x = MMQ_X_Q2_K_RDNA1;
3822
+ const int mmq_y = MMQ_Y_Q2_K_RDNA1;
3823
+ const int nwarps = NWARPS_Q2_K_RDNA1;
3824
+ #endif // defined(RDNA3) || defined(RDNA2)
3825
+
3826
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
3827
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3828
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3829
+
3830
+ #elif __CUDA_ARCH__ >= CC_TURING
3633
3831
  const int mmq_x = MMQ_X_Q2_K_AMPERE;
3634
3832
  const int mmq_y = MMQ_Y_Q2_K_AMPERE;
3635
3833
  const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3652,6 +3850,12 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
3652
3850
  #endif // __CUDA_ARCH__ >= CC_TURING
3653
3851
  }
3654
3852
 
3853
+ #define MMQ_X_Q3_K_RDNA2 128
3854
+ #define MMQ_Y_Q3_K_RDNA2 64
3855
+ #define NWARPS_Q3_K_RDNA2 8
3856
+ #define MMQ_X_Q3_K_RDNA1 32
3857
+ #define MMQ_Y_Q3_K_RDNA1 128
3858
+ #define NWARPS_Q3_K_RDNA1 8
3655
3859
  #define MMQ_X_Q3_K_AMPERE 128
3656
3860
  #define MMQ_Y_Q3_K_AMPERE 128
3657
3861
  #define NWARPS_Q3_K_AMPERE 4
@@ -3660,14 +3864,33 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
3660
3864
  #define NWARPS_Q3_K_PASCAL 8
3661
3865
 
3662
3866
  template <bool need_check> static __global__ void
3663
- #if __CUDA_ARCH__ < CC_TURING
3867
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3868
+ #if defined(RDNA3) || defined(RDNA2)
3869
+ __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
3870
+ #endif // defined(RDNA3) || defined(RDNA2)
3871
+ #elif __CUDA_ARCH__ < CC_TURING
3664
3872
  __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
3665
3873
  #endif // __CUDA_ARCH__ < CC_TURING
3666
3874
  mul_mat_q3_K(
3667
3875
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3668
3876
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3669
3877
 
3670
- #if __CUDA_ARCH__ >= CC_TURING
3878
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3879
+ #if defined(RDNA3) || defined(RDNA2)
3880
+ const int mmq_x = MMQ_X_Q3_K_RDNA2;
3881
+ const int mmq_y = MMQ_Y_Q3_K_RDNA2;
3882
+ const int nwarps = NWARPS_Q3_K_RDNA2;
3883
+ #else
3884
+ const int mmq_x = MMQ_X_Q3_K_RDNA1;
3885
+ const int mmq_y = MMQ_Y_Q3_K_RDNA1;
3886
+ const int nwarps = NWARPS_Q3_K_RDNA1;
3887
+ #endif // defined(RDNA3) || defined(RDNA2)
3888
+
3889
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
3890
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3891
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3892
+
3893
+ #elif __CUDA_ARCH__ >= CC_TURING
3671
3894
  const int mmq_x = MMQ_X_Q3_K_AMPERE;
3672
3895
  const int mmq_y = MMQ_Y_Q3_K_AMPERE;
3673
3896
  const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3690,6 +3913,12 @@ template <bool need_check> static __global__ void
3690
3913
  #endif // __CUDA_ARCH__ >= CC_TURING
3691
3914
  }
3692
3915
 
3916
+ #define MMQ_X_Q4_K_RDNA2 64
3917
+ #define MMQ_Y_Q4_K_RDNA2 128
3918
+ #define NWARPS_Q4_K_RDNA2 8
3919
+ #define MMQ_X_Q4_K_RDNA1 32
3920
+ #define MMQ_Y_Q4_K_RDNA1 64
3921
+ #define NWARPS_Q4_K_RDNA1 8
3693
3922
  #define MMQ_X_Q4_K_AMPERE 64
3694
3923
  #define MMQ_Y_Q4_K_AMPERE 128
3695
3924
  #define NWARPS_Q4_K_AMPERE 4
@@ -3698,14 +3927,33 @@ template <bool need_check> static __global__ void
3698
3927
  #define NWARPS_Q4_K_PASCAL 8
3699
3928
 
3700
3929
  template <bool need_check> static __global__ void
3701
- #if __CUDA_ARCH__ < CC_TURING
3930
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3931
+ #if defined(RDNA3) || defined(RDNA2)
3932
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
3933
+ #endif // defined(RDNA3) || defined(RDNA2)
3934
+ #elif __CUDA_ARCH__ < CC_TURING
3702
3935
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
3703
3936
  #endif // __CUDA_ARCH__ < CC_TURING
3704
3937
  mul_mat_q4_K(
3705
3938
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3706
3939
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3707
3940
 
3708
- #if __CUDA_ARCH__ >= CC_TURING
3941
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3942
+ #if defined(RDNA3) || defined(RDNA2)
3943
+ const int mmq_x = MMQ_X_Q4_K_RDNA2;
3944
+ const int mmq_y = MMQ_Y_Q4_K_RDNA2;
3945
+ const int nwarps = NWARPS_Q4_K_RDNA2;
3946
+ #else
3947
+ const int mmq_x = MMQ_X_Q4_K_RDNA1;
3948
+ const int mmq_y = MMQ_Y_Q4_K_RDNA1;
3949
+ const int nwarps = NWARPS_Q4_K_RDNA1;
3950
+ #endif // defined(RDNA3) || defined(RDNA2)
3951
+
3952
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
3953
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3954
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3955
+
3956
+ #elif __CUDA_ARCH__ >= CC_TURING
3709
3957
  const int mmq_x = MMQ_X_Q4_K_AMPERE;
3710
3958
  const int mmq_y = MMQ_Y_Q4_K_AMPERE;
3711
3959
  const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -3728,6 +3976,12 @@ template <bool need_check> static __global__ void
3728
3976
  #endif // __CUDA_ARCH__ >= CC_TURING
3729
3977
  }
3730
3978
 
3979
+ #define MMQ_X_Q5_K_RDNA2 64
3980
+ #define MMQ_Y_Q5_K_RDNA2 128
3981
+ #define NWARPS_Q5_K_RDNA2 8
3982
+ #define MMQ_X_Q5_K_RDNA1 32
3983
+ #define MMQ_Y_Q5_K_RDNA1 64
3984
+ #define NWARPS_Q5_K_RDNA1 8
3731
3985
  #define MMQ_X_Q5_K_AMPERE 64
3732
3986
  #define MMQ_Y_Q5_K_AMPERE 128
3733
3987
  #define NWARPS_Q5_K_AMPERE 4
@@ -3735,11 +3989,32 @@ template <bool need_check> static __global__ void
3735
3989
  #define MMQ_Y_Q5_K_PASCAL 64
3736
3990
  #define NWARPS_Q5_K_PASCAL 8
3737
3991
 
3738
- template <bool need_check> static __global__ void mul_mat_q5_K(
3992
+ template <bool need_check> static __global__ void
3993
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3994
+ #if defined(RDNA3) || defined(RDNA2)
3995
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
3996
+ #endif // defined(RDNA3) || defined(RDNA2)
3997
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3998
+ mul_mat_q5_K(
3739
3999
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3740
4000
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3741
4001
 
3742
- #if __CUDA_ARCH__ >= CC_TURING
4002
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4003
+ #if defined(RDNA3) || defined(RDNA2)
4004
+ const int mmq_x = MMQ_X_Q5_K_RDNA2;
4005
+ const int mmq_y = MMQ_Y_Q5_K_RDNA2;
4006
+ const int nwarps = NWARPS_Q5_K_RDNA2;
4007
+ #else
4008
+ const int mmq_x = MMQ_X_Q5_K_RDNA1;
4009
+ const int mmq_y = MMQ_Y_Q5_K_RDNA1;
4010
+ const int nwarps = NWARPS_Q5_K_RDNA1;
4011
+ #endif // defined(RDNA3) || defined(RDNA2)
4012
+
4013
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4014
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4015
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4016
+
4017
+ #elif __CUDA_ARCH__ >= CC_TURING
3743
4018
  const int mmq_x = MMQ_X_Q5_K_AMPERE;
3744
4019
  const int mmq_y = MMQ_Y_Q5_K_AMPERE;
3745
4020
  const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -3762,6 +4037,12 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
3762
4037
  #endif // __CUDA_ARCH__ >= CC_TURING
3763
4038
  }
3764
4039
 
4040
+ #define MMQ_X_Q6_K_RDNA2 64
4041
+ #define MMQ_Y_Q6_K_RDNA2 128
4042
+ #define NWARPS_Q6_K_RDNA2 8
4043
+ #define MMQ_X_Q6_K_RDNA1 32
4044
+ #define MMQ_Y_Q6_K_RDNA1 64
4045
+ #define NWARPS_Q6_K_RDNA1 8
3765
4046
  #define MMQ_X_Q6_K_AMPERE 64
3766
4047
  #define MMQ_Y_Q6_K_AMPERE 64
3767
4048
  #define NWARPS_Q6_K_AMPERE 4
@@ -3770,14 +4051,33 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
3770
4051
  #define NWARPS_Q6_K_PASCAL 8
3771
4052
 
3772
4053
  template <bool need_check> static __global__ void
3773
- #if __CUDA_ARCH__ < CC_TURING
4054
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4055
+ #if defined(RDNA3) || defined(RDNA2)
4056
+ __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
4057
+ #endif // defined(RDNA3) || defined(RDNA2)
4058
+ #elif __CUDA_ARCH__ < CC_TURING
3774
4059
  __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
3775
4060
  #endif // __CUDA_ARCH__ < CC_TURING
3776
4061
  mul_mat_q6_K(
3777
4062
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3778
4063
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3779
4064
 
3780
- #if __CUDA_ARCH__ >= CC_TURING
4065
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4066
+ #if defined(RDNA3) || defined(RDNA2)
4067
+ const int mmq_x = MMQ_X_Q6_K_RDNA2;
4068
+ const int mmq_y = MMQ_Y_Q6_K_RDNA2;
4069
+ const int nwarps = NWARPS_Q6_K_RDNA2;
4070
+ #else
4071
+ const int mmq_x = MMQ_X_Q6_K_RDNA1;
4072
+ const int mmq_y = MMQ_Y_Q6_K_RDNA1;
4073
+ const int nwarps = NWARPS_Q6_K_RDNA1;
4074
+ #endif // defined(RDNA3) || defined(RDNA2)
4075
+
4076
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4077
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4078
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4079
+
4080
+ #elif __CUDA_ARCH__ >= CC_TURING
3781
4081
  const int mmq_x = MMQ_X_Q6_K_AMPERE;
3782
4082
  const int mmq_y = MMQ_Y_Q6_K_AMPERE;
3783
4083
  const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4086,7 +4386,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
4086
4386
  dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
4087
4387
  }
4088
4388
 
4089
- static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
4389
+ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
4390
+ const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
4090
4391
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
4091
4392
  const int half_n_dims = ncols/4;
4092
4393
 
@@ -4098,8 +4399,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
4098
4399
  const int i = row*ncols + col;
4099
4400
 
4100
4401
  const float col_theta_scale = powf(theta_scale, col);
4402
+ const float p = p0 + p_delta*(row/p_delta_rows);
4101
4403
 
4102
- const float theta = p*col_theta_scale;
4404
+ const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
4103
4405
  const float sin_theta = sinf(theta);
4104
4406
  const float cos_theta = cosf(theta);
4105
4407
 
@@ -4109,7 +4411,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
4109
4411
  dst[i + 0] = x0*cos_theta - x1*sin_theta;
4110
4412
  dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
4111
4413
 
4112
- const float block_theta = block_p*col_theta_scale;
4414
+ const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
4113
4415
  const float sin_block_theta = sinf(block_theta);
4114
4416
  const float cos_block_theta = cosf(block_theta);
4115
4417
 
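Editor's note: `rope_glm_f32` no longer receives a precomputed `p`/`block_p` pair; it derives the per-row position from `p0`, `p_delta` and `p_delta_rows` and clamps it against `n_ctx` inside the kernel. A host-side sketch of the angle computation, with names following the kernel parameters above (the helper itself is illustrative only):

```cpp
// Reproduces the per-element angle math of the updated rope_glm_f32 kernel.
#include <algorithm>
#include <cmath>

static void glm_rope_angles(float p0, float p_delta, int p_delta_rows, int n_ctx,
                            float theta_scale, int row, int col,
                            float & theta, float & block_theta) {
    const float col_theta_scale = std::pow(theta_scale, (float) col);
    const float p = p0 + p_delta*(row/p_delta_rows);      // integer division, as in the kernel
    // the regular rotation is capped at the first n_ctx-2 positions ...
    theta       = std::min(p, p_delta*(n_ctx - 2))*col_theta_scale;
    // ... and anything past that point feeds the "block" rotation instead
    block_theta = std::max(p - p_delta*(n_ctx - 2), 0.0f)*col_theta_scale;
}
```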
@@ -4558,7 +4860,15 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
4558
4860
  const int compute_capability = g_compute_capabilities[id];
4559
4861
 
4560
4862
  int mmq_x, mmq_y, nwarps;
4561
- if (compute_capability >= CC_TURING) {
4863
+ if (compute_capability >= CC_RDNA2) {
4864
+ mmq_x = MMQ_X_Q4_0_RDNA2;
4865
+ mmq_y = MMQ_Y_Q4_0_RDNA2;
4866
+ nwarps = NWARPS_Q4_0_RDNA2;
4867
+ } else if (compute_capability >= CC_OFFSET_AMD) {
4868
+ mmq_x = MMQ_X_Q4_0_RDNA1;
4869
+ mmq_y = MMQ_Y_Q4_0_RDNA1;
4870
+ nwarps = NWARPS_Q4_0_RDNA1;
4871
+ } else if (compute_capability >= CC_TURING) {
4562
4872
  mmq_x = MMQ_X_Q4_0_AMPERE;
4563
4873
  mmq_y = MMQ_Y_Q4_0_AMPERE;
4564
4874
  nwarps = NWARPS_Q4_0_AMPERE;
@@ -4595,7 +4905,15 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
4595
4905
  const int compute_capability = g_compute_capabilities[id];
4596
4906
 
4597
4907
  int mmq_x, mmq_y, nwarps;
4598
- if (compute_capability >= CC_TURING) {
4908
+ if (compute_capability >= CC_RDNA2) {
4909
+ mmq_x = MMQ_X_Q4_1_RDNA2;
4910
+ mmq_y = MMQ_Y_Q4_1_RDNA2;
4911
+ nwarps = NWARPS_Q4_1_RDNA2;
4912
+ } else if (compute_capability >= CC_OFFSET_AMD) {
4913
+ mmq_x = MMQ_X_Q4_1_RDNA1;
4914
+ mmq_y = MMQ_Y_Q4_1_RDNA1;
4915
+ nwarps = NWARPS_Q4_1_RDNA1;
4916
+ } else if (compute_capability >= CC_TURING) {
4599
4917
  mmq_x = MMQ_X_Q4_1_AMPERE;
4600
4918
  mmq_y = MMQ_Y_Q4_1_AMPERE;
4601
4919
  nwarps = NWARPS_Q4_1_AMPERE;
@@ -4632,7 +4950,15 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
4632
4950
  const int compute_capability = g_compute_capabilities[id];
4633
4951
 
4634
4952
  int mmq_x, mmq_y, nwarps;
4635
- if (compute_capability >= CC_TURING) {
4953
+ if (compute_capability >= CC_RDNA2) {
4954
+ mmq_x = MMQ_X_Q5_0_RDNA2;
4955
+ mmq_y = MMQ_Y_Q5_0_RDNA2;
4956
+ nwarps = NWARPS_Q5_0_RDNA2;
4957
+ } else if (compute_capability >= CC_OFFSET_AMD) {
4958
+ mmq_x = MMQ_X_Q5_0_RDNA1;
4959
+ mmq_y = MMQ_Y_Q5_0_RDNA1;
4960
+ nwarps = NWARPS_Q5_0_RDNA1;
4961
+ } else if (compute_capability >= CC_TURING) {
4636
4962
  mmq_x = MMQ_X_Q5_0_AMPERE;
4637
4963
  mmq_y = MMQ_Y_Q5_0_AMPERE;
4638
4964
  nwarps = NWARPS_Q5_0_AMPERE;
@@ -4669,7 +4995,15 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
4669
4995
  const int compute_capability = g_compute_capabilities[id];
4670
4996
 
4671
4997
  int mmq_x, mmq_y, nwarps;
4672
- if (compute_capability >= CC_TURING) {
4998
+ if (compute_capability >= CC_RDNA2) {
4999
+ mmq_x = MMQ_X_Q5_1_RDNA2;
5000
+ mmq_y = MMQ_Y_Q5_1_RDNA2;
5001
+ nwarps = NWARPS_Q5_1_RDNA2;
5002
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5003
+ mmq_x = MMQ_X_Q5_1_RDNA1;
5004
+ mmq_y = MMQ_Y_Q5_1_RDNA1;
5005
+ nwarps = NWARPS_Q5_1_RDNA1;
5006
+ } else if (compute_capability >= CC_TURING) {
4673
5007
  mmq_x = MMQ_X_Q5_1_AMPERE;
4674
5008
  mmq_y = MMQ_Y_Q5_1_AMPERE;
4675
5009
  nwarps = NWARPS_Q5_1_AMPERE;
@@ -4706,7 +5040,15 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
4706
5040
  const int compute_capability = g_compute_capabilities[id];
4707
5041
 
4708
5042
  int mmq_x, mmq_y, nwarps;
4709
- if (compute_capability >= CC_TURING) {
5043
+ if (compute_capability >= CC_RDNA2) {
5044
+ mmq_x = MMQ_X_Q8_0_RDNA2;
5045
+ mmq_y = MMQ_Y_Q8_0_RDNA2;
5046
+ nwarps = NWARPS_Q8_0_RDNA2;
5047
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5048
+ mmq_x = MMQ_X_Q8_0_RDNA1;
5049
+ mmq_y = MMQ_Y_Q8_0_RDNA1;
5050
+ nwarps = NWARPS_Q8_0_RDNA1;
5051
+ } else if (compute_capability >= CC_TURING) {
4710
5052
  mmq_x = MMQ_X_Q8_0_AMPERE;
4711
5053
  mmq_y = MMQ_Y_Q8_0_AMPERE;
4712
5054
  nwarps = NWARPS_Q8_0_AMPERE;
@@ -4743,7 +5085,15 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
4743
5085
  const int compute_capability = g_compute_capabilities[id];
4744
5086
 
4745
5087
  int mmq_x, mmq_y, nwarps;
4746
- if (compute_capability >= CC_TURING) {
5088
+ if (compute_capability >= CC_RDNA2) {
5089
+ mmq_x = MMQ_X_Q2_K_RDNA2;
5090
+ mmq_y = MMQ_Y_Q2_K_RDNA2;
5091
+ nwarps = NWARPS_Q2_K_RDNA2;
5092
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5093
+ mmq_x = MMQ_X_Q2_K_RDNA1;
5094
+ mmq_y = MMQ_Y_Q2_K_RDNA1;
5095
+ nwarps = NWARPS_Q2_K_RDNA1;
5096
+ } else if (compute_capability >= CC_TURING) {
4747
5097
  mmq_x = MMQ_X_Q2_K_AMPERE;
4748
5098
  mmq_y = MMQ_Y_Q2_K_AMPERE;
4749
5099
  nwarps = NWARPS_Q2_K_AMPERE;
@@ -4782,7 +5132,15 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
4782
5132
  const int compute_capability = g_compute_capabilities[id];
4783
5133
 
4784
5134
  int mmq_x, mmq_y, nwarps;
4785
- if (compute_capability >= CC_TURING) {
5135
+ if (compute_capability >= CC_RDNA2) {
5136
+ mmq_x = MMQ_X_Q3_K_RDNA2;
5137
+ mmq_y = MMQ_Y_Q3_K_RDNA2;
5138
+ nwarps = NWARPS_Q3_K_RDNA2;
5139
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5140
+ mmq_x = MMQ_X_Q3_K_RDNA1;
5141
+ mmq_y = MMQ_Y_Q3_K_RDNA1;
5142
+ nwarps = NWARPS_Q3_K_RDNA1;
5143
+ } else if (compute_capability >= CC_TURING) {
4786
5144
  mmq_x = MMQ_X_Q3_K_AMPERE;
4787
5145
  mmq_y = MMQ_Y_Q3_K_AMPERE;
4788
5146
  nwarps = NWARPS_Q3_K_AMPERE;
@@ -4820,7 +5178,15 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
4820
5178
  const int compute_capability = g_compute_capabilities[id];
4821
5179
 
4822
5180
  int mmq_x, mmq_y, nwarps;
4823
- if (compute_capability >= CC_TURING) {
5181
+ if (compute_capability >= CC_RDNA2) {
5182
+ mmq_x = MMQ_X_Q4_K_RDNA2;
5183
+ mmq_y = MMQ_Y_Q4_K_RDNA2;
5184
+ nwarps = NWARPS_Q4_K_RDNA2;
5185
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5186
+ mmq_x = MMQ_X_Q4_K_RDNA1;
5187
+ mmq_y = MMQ_Y_Q4_K_RDNA1;
5188
+ nwarps = NWARPS_Q4_K_RDNA1;
5189
+ } else if (compute_capability >= CC_TURING) {
4824
5190
  mmq_x = MMQ_X_Q4_K_AMPERE;
4825
5191
  mmq_y = MMQ_Y_Q4_K_AMPERE;
4826
5192
  nwarps = NWARPS_Q4_K_AMPERE;
@@ -4857,7 +5223,15 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
4857
5223
  const int compute_capability = g_compute_capabilities[id];
4858
5224
 
4859
5225
  int mmq_x, mmq_y, nwarps;
4860
- if (compute_capability >= CC_TURING) {
5226
+ if (compute_capability >= CC_RDNA2) {
5227
+ mmq_x = MMQ_X_Q5_K_RDNA2;
5228
+ mmq_y = MMQ_Y_Q5_K_RDNA2;
5229
+ nwarps = NWARPS_Q5_K_RDNA2;
5230
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5231
+ mmq_x = MMQ_X_Q5_K_RDNA1;
5232
+ mmq_y = MMQ_Y_Q5_K_RDNA1;
5233
+ nwarps = NWARPS_Q5_K_RDNA1;
5234
+ } else if (compute_capability >= CC_TURING) {
4861
5235
  mmq_x = MMQ_X_Q5_K_AMPERE;
4862
5236
  mmq_y = MMQ_Y_Q5_K_AMPERE;
4863
5237
  nwarps = NWARPS_Q5_K_AMPERE;
@@ -4894,7 +5268,15 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
4894
5268
  const int compute_capability = g_compute_capabilities[id];
4895
5269
 
4896
5270
  int mmq_x, mmq_y, nwarps;
4897
- if (compute_capability >= CC_TURING) {
5271
+ if (compute_capability >= CC_RDNA2) {
5272
+ mmq_x = MMQ_X_Q6_K_RDNA2;
5273
+ mmq_y = MMQ_Y_Q6_K_RDNA2;
5274
+ nwarps = NWARPS_Q6_K_RDNA2;
5275
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5276
+ mmq_x = MMQ_X_Q6_K_RDNA1;
5277
+ mmq_y = MMQ_Y_Q6_K_RDNA1;
5278
+ nwarps = NWARPS_Q6_K_RDNA1;
5279
+ } else if (compute_capability >= CC_TURING) {
4898
5280
  mmq_x = MMQ_X_Q6_K_AMPERE;
4899
5281
  mmq_y = MMQ_Y_Q6_K_AMPERE;
4900
5282
  nwarps = NWARPS_Q6_K_AMPERE;
@@ -4984,12 +5366,13 @@ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, co
4984
5366
  rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
4985
5367
  }
4986
5368
 
4987
- static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
4988
- GGML_ASSERT(nrows % 4 == 0);
4989
- const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
4990
- const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
5369
+ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
5370
+ const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
5371
+ GGML_ASSERT(ncols % 4 == 0);
5372
+ const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
5373
+ const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
4991
5374
  const dim3 block_nums(num_blocks_x, nrows, 1);
4992
- rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
5375
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
4993
5376
  }
4994
5377
 
4995
5378
  static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5127,25 +5510,30 @@ void ggml_init_cublas() {
5127
5510
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
5128
5511
  int64_t total_vram = 0;
5129
5512
  fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
5130
- for (int id = 0; id < g_device_count; ++id) {
5513
+ for (int64_t id = 0; id < g_device_count; ++id) {
5131
5514
  cudaDeviceProp prop;
5132
5515
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
5133
- fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
5516
+ fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
5134
5517
 
5135
5518
  g_tensor_split[id] = total_vram;
5136
5519
  total_vram += prop.totalGlobalMem;
5137
-
5520
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
5521
+ g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
5522
+ #else
5138
5523
  g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
5524
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
5139
5525
  }
5140
- for (int id = 0; id < g_device_count; ++id) {
5526
+ for (int64_t id = 0; id < g_device_count; ++id) {
5141
5527
  g_tensor_split[id] /= total_vram;
5142
5528
  }
5143
5529
 
5144
- for (int id = 0; id < g_device_count; ++id) {
5145
- CUDA_CHECK(cudaSetDevice(id));
5530
+ for (int64_t id = 0; id < g_device_count; ++id) {
5531
+ CUDA_CHECK(ggml_cuda_set_device(id));
5146
5532
 
5147
- // create main stream
5148
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
5533
+ // create cuda streams
5534
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
5535
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
5536
+ }
5149
5537
 
5150
5538
  // create cublas handle
5151
5539
  CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
@@ -5214,7 +5602,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5214
5602
  if (src->backend == GGML_BACKEND_CPU) {
5215
5603
  kind = cudaMemcpyHostToDevice;
5216
5604
  src_ptr = (char *) src->data;
5217
- } else if (src->backend == GGML_BACKEND_GPU) {
5605
+ } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
5606
+ GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
5218
5607
  kind = cudaMemcpyDeviceToDevice;
5219
5608
  struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
5220
5609
  int id;
@@ -5253,236 +5642,205 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5253
5642
  }
5254
5643
 
5255
5644
  inline void ggml_cuda_op_add(
5256
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5257
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5258
- cudaStream_t & cudaStream_main){
5645
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5646
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5259
5647
 
5260
- GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
5261
- GGML_ASSERT(src1_ddf_i != nullptr);
5262
- GGML_ASSERT(dst_ddf_i != nullptr);
5263
-
5264
- const int64_t ne00 = src0->ne[0];
5265
- const int64_t i01_diff = i01_high - i01_low;
5648
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
5266
5649
 
5267
5650
  const int64_t ne10 = src1->ne[0];
5268
5651
  const int64_t ne11 = src1->ne[1];
5269
5652
 
5270
- // compute
5271
5653
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
5272
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
5654
+ add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
5273
5655
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
5274
- add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
5656
+ add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
5275
5657
  } else {
5276
5658
  GGML_ASSERT(false);
5277
5659
  }
5278
5660
 
5279
5661
  (void) src1;
5280
5662
  (void) dst;
5281
- (void) src0_ddq_i;
5282
- (void) i02;
5283
- (void) i1;
5284
5663
  }
5285
5664
 
5286
5665
  inline void ggml_cuda_op_mul(
5287
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5288
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5289
- cudaStream_t & cudaStream_main){
5290
-
5291
- GGML_ASSERT(src0_ddf_i != nullptr);
5292
- GGML_ASSERT(src1_ddf_i != nullptr);
5293
- GGML_ASSERT(dst_ddf_i != nullptr);
5666
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5667
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5294
5668
 
5295
- const int64_t ne00 = src0->ne[0];
5296
- const int64_t i01_diff = i01_high - i01_low;
5669
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5670
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
5671
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5297
5672
 
5298
5673
  const int64_t ne10 = src1->ne[0];
5299
5674
  const int64_t ne11 = src1->ne[1];
5300
5675
 
5301
- mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
5676
+ mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
5302
5677
 
5303
5678
  (void) dst;
5304
- (void) src0_ddq_i;
5305
- (void) i02;
5306
- (void) i1;
5307
5679
  }
5308
5680
 
5309
5681
  inline void ggml_cuda_op_gelu(
5310
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5311
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5312
- cudaStream_t & cudaStream_main){
5682
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5683
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5313
5684
 
5314
- GGML_ASSERT(src0_ddf_i != nullptr);
5315
- GGML_ASSERT(dst_ddf_i != nullptr);
5685
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5686
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5316
5687
 
5317
- const int64_t ne00 = src0->ne[0];
5318
- const int64_t i01_diff = i01_high - i01_low;
5319
-
5320
- // compute
5321
- gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
5688
+ gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
5322
5689
 
5323
5690
  (void) src1;
5324
5691
  (void) dst;
5325
- (void) src0_ddq_i;
5326
- (void) src1_ddf_i;
5327
- (void) i02;
5328
- (void) i1;
5692
+ (void) src1_dd;
5329
5693
  }
5330
5694
 
5331
5695
  inline void ggml_cuda_op_silu(
5332
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5333
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5334
- cudaStream_t & cudaStream_main){
5696
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5697
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5335
5698
 
5336
- GGML_ASSERT(src0_ddf_i != nullptr);
5337
- GGML_ASSERT(dst_ddf_i != nullptr);
5699
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5700
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5338
5701
 
5339
- const int64_t ne00 = src0->ne[0];
5340
- const int64_t i01_diff = i01_high - i01_low;
5341
-
5342
- // compute
5343
- silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
5702
+ silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
5344
5703
 
5345
5704
  (void) src1;
5346
5705
  (void) dst;
5347
- (void) src0_ddq_i;
5348
- (void) src1_ddf_i;
5349
- (void) i02;
5350
- (void) i1;
5706
+ (void) src1_dd;
5351
5707
  }
5352
5708
 
5353
5709
  inline void ggml_cuda_op_norm(
5354
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5355
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5356
- cudaStream_t & cudaStream_main){
5710
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5711
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5357
5712
 
5358
- GGML_ASSERT(src0_ddf_i != nullptr);
5359
- GGML_ASSERT(dst_ddf_i != nullptr);
5713
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5714
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5360
5715
 
5361
5716
  const int64_t ne00 = src0->ne[0];
5362
- const int64_t i01_diff = i01_high - i01_low;
5717
+ const int64_t nrows = ggml_nrows(src0);
5363
5718
 
5364
- // compute
5365
- norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
5719
+ norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
5366
5720
 
5367
5721
  (void) src1;
5368
5722
  (void) dst;
5369
- (void) src0_ddq_i;
5370
- (void) src1_ddf_i;
5371
- (void) i02;
5372
- (void) i1;
5723
+ (void) src1_dd;
5373
5724
  }
5374
5725
 
5375
5726
  inline void ggml_cuda_op_rms_norm(
5376
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5377
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5378
- cudaStream_t & cudaStream_main){
5727
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5728
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5379
5729
 
5380
- GGML_ASSERT(src0_ddf_i != nullptr);
5381
- GGML_ASSERT(dst_ddf_i != nullptr);
5730
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5731
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5382
5732
 
5383
5733
  const int64_t ne00 = src0->ne[0];
5384
- const int64_t i01_diff = i01_high - i01_low;
5734
+ const int64_t nrows = ggml_nrows(src0);
5385
5735
 
5386
5736
  float eps;
5387
5737
  memcpy(&eps, dst->op_params, sizeof(float));
5388
5738
 
5389
- // compute
5390
- rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
5739
+ rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
5391
5740
 
5392
5741
  (void) src1;
5393
5742
  (void) dst;
5394
- (void) src0_ddq_i;
5395
- (void) src1_ddf_i;
5396
- (void) i02;
5397
- (void) i1;
5743
+ (void) src1_dd;
5398
5744
  }
5399
5745
 
5400
5746
  inline void ggml_cuda_op_mul_mat_q(
5401
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5402
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5403
- cudaStream_t & cudaStream_main){
5404
-
5405
- GGML_ASSERT(src0_ddq_i != nullptr);
5406
- GGML_ASSERT(src1_ddf_i != nullptr);
5407
- GGML_ASSERT(dst_ddf_i != nullptr);
5747
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5748
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
5749
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5408
5750
 
5409
5751
  const int64_t ne00 = src0->ne[0];
5410
5752
 
5411
5753
  const int64_t ne10 = src1->ne[0];
5412
- const int64_t ne11 = src1->ne[1];
5413
5754
  GGML_ASSERT(ne10 % QK8_1 == 0);
5414
5755
 
5415
5756
  const int64_t ne0 = dst->ne[0];
5416
5757
 
5417
- const int64_t i01_diff = i01_high - i01_low;
5758
+ const int64_t row_diff = row_high - row_low;
5418
5759
 
5419
5760
  int id;
5420
5761
  CUDA_CHECK(cudaGetDevice(&id));
5421
5762
 
5422
5763
  // the main device has a larger memory buffer to hold the results from all GPUs
5423
5764
  // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
5424
- const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
5425
-
5426
- const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
5427
- ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
5428
- size_t as;
5429
- void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
5430
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
5765
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
5431
5766
 
5432
5767
  switch (src0->type) {
5433
5768
  case GGML_TYPE_Q4_0:
5434
- ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5769
+ ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5435
5770
  break;
5436
5771
  case GGML_TYPE_Q4_1:
5437
- ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5772
+ ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5438
5773
  break;
5439
5774
  case GGML_TYPE_Q5_0:
5440
- ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5775
+ ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5441
5776
  break;
5442
5777
  case GGML_TYPE_Q5_1:
5443
- ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5778
+ ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5444
5779
  break;
5445
5780
  case GGML_TYPE_Q8_0:
5446
- ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5781
+ ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5447
5782
  break;
5448
5783
  case GGML_TYPE_Q2_K:
5449
- ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5784
+ ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5450
5785
  break;
5451
5786
  case GGML_TYPE_Q3_K:
5452
- ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5787
+ ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5453
5788
  break;
5454
5789
  case GGML_TYPE_Q4_K:
5455
- ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5790
+ ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5456
5791
  break;
5457
5792
  case GGML_TYPE_Q5_K:
5458
- ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5793
+ ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5459
5794
  break;
5460
5795
  case GGML_TYPE_Q6_K:
5461
- ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5796
+ ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5462
5797
  break;
5463
5798
  default:
5464
5799
  GGML_ASSERT(false);
5465
5800
  break;
5466
5801
  }
5467
5802
 
5468
- ggml_cuda_pool_free(src1_q8_1, as);
5469
-
5470
5803
  (void) src1;
5471
5804
  (void) dst;
5472
- (void) src0_ddf_i;
5473
- (void) i02;
5474
- (void) i1;
5805
+ (void) src1_ddf_i;
5475
5806
  }
5476
5807
 
5477
5808
  static int64_t get_row_rounding(ggml_type type) {
5478
- int max_compute_capability = INT_MIN;
5479
- for (int id = 0; id < g_device_count; ++id) {
5480
- if (max_compute_capability < g_compute_capabilities[id]
5481
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
5482
- max_compute_capability = g_compute_capabilities[id];
5809
+ int64_t min_compute_capability = INT_MAX;
5810
+ int64_t max_compute_capability = INT_MIN;
5811
+ for (int64_t id = 0; id < g_device_count; ++id) {
5812
+ if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
5813
+ if (min_compute_capability > g_compute_capabilities[id]) {
5814
+ min_compute_capability = g_compute_capabilities[id];
5815
+ }
5816
+ if (max_compute_capability < g_compute_capabilities[id]) {
5817
+ max_compute_capability = g_compute_capabilities[id];
5818
+ }
5483
5819
  }
5484
5820
  }
5485
5821
 
5822
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
5823
+ switch(type) {
5824
+ case GGML_TYPE_Q4_0:
5825
+ case GGML_TYPE_Q4_1:
5826
+ case GGML_TYPE_Q5_0:
5827
+ case GGML_TYPE_Q5_1:
5828
+ case GGML_TYPE_Q8_0:
5829
+ return max_compute_capability >= CC_RDNA2 ? 128 : 64;
5830
+ case GGML_TYPE_F16:
5831
+ return 1;
5832
+ case GGML_TYPE_Q2_K:
5833
+ return max_compute_capability >= CC_RDNA2 ? 128 : 32;
5834
+ case GGML_TYPE_Q3_K:
5835
+ return min_compute_capability < CC_RDNA2 ? 128 : 64;
5836
+ case GGML_TYPE_Q4_K:
5837
+ case GGML_TYPE_Q5_K:
5838
+ case GGML_TYPE_Q6_K:
5839
+ return max_compute_capability >= CC_RDNA2 ? 128 : 64;
5840
+ default:
5841
+ GGML_ASSERT(false);
5842
+ }
5843
+ #else
5486
5844
  switch(type) {
5487
5845
  case GGML_TYPE_Q4_0:
5488
5846
  case GGML_TYPE_Q4_1:
@@ -5503,170 +5861,147 @@ static int64_t get_row_rounding(ggml_type type) {
5503
5861
  default:
5504
5862
  GGML_ASSERT(false);
5505
5863
  }
5864
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
5506
5865
  }
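The rounding value returned here is consumed below in ggml_cuda_op_mul_mat, where each device's row range is aligned to it. A minimal standalone sketch of that alignment, assuming a hypothetical 3-way tensor split and a rounding of 64 (the numbers are illustrative, not taken from the code above):

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne01      = 4096;                  // rows of src0
    const float   split[3]  = {0.00f, 0.40f, 0.75f}; // cumulative tensor split (hypothetical)
    const int     n_devices = 3;
    const int64_t rounding  = 64;                    // e.g. a value get_row_rounding could return

    for (int id = 0; id < n_devices; ++id) {
        int64_t row_low  = 0;
        int64_t row_high = ne01;
        if (id != 0) {
            row_low  = (int64_t)(ne01*split[id]);
            row_low -= row_low % rounding;           // round down to a tile boundary
        }
        if (id != n_devices - 1) {
            row_high  = (int64_t)(ne01*split[id + 1]);
            row_high -= row_high % rounding;
        }
        printf("device %d: rows [%lld, %lld)\n", id, (long long) row_low, (long long) row_high);
    }
    return 0;
}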
5507
5866
 
5508
- inline void ggml_cuda_op_mul_mat_vec(
5509
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5510
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5511
- cudaStream_t & cudaStream_main){
5512
-
5513
- GGML_ASSERT(src0_ddq_i != nullptr);
5514
- GGML_ASSERT(src1_ddf_i != nullptr);
5515
- GGML_ASSERT(dst_ddf_i != nullptr);
5867
+ inline void ggml_cuda_op_mul_mat_vec_q(
5868
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5869
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
5870
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5516
5871
 
5517
5872
  const int64_t ne00 = src0->ne[0];
5518
- const int64_t nrows = i01_high - i01_low;
5873
+ const int64_t row_diff = row_high - row_low;
5519
5874
 
5520
- #ifdef GGML_CUDA_FORCE_DMMV
5521
- const bool use_mul_mat_vec_q = false;
5522
- (void) g_compute_capabilities[0];
5523
- #else
5524
- int id;
5525
- CUDA_CHECK(cudaGetDevice(&id));
5875
+ switch (src0->type) {
5876
+ case GGML_TYPE_Q4_0:
5877
+ mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5878
+ break;
5879
+ case GGML_TYPE_Q4_1:
5880
+ mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5881
+ break;
5882
+ case GGML_TYPE_Q5_0:
5883
+ mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5884
+ break;
5885
+ case GGML_TYPE_Q5_1:
5886
+ mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5887
+ break;
5888
+ case GGML_TYPE_Q8_0:
5889
+ mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5890
+ break;
5891
+ case GGML_TYPE_Q2_K:
5892
+ mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5893
+ break;
5894
+ case GGML_TYPE_Q3_K:
5895
+ mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5896
+ break;
5897
+ case GGML_TYPE_Q4_K:
5898
+ mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5899
+ break;
5900
+ case GGML_TYPE_Q5_K:
5901
+ mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5902
+ break;
5903
+ case GGML_TYPE_Q6_K:
5904
+ mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5905
+ break;
5906
+ default:
5907
+ GGML_ASSERT(false);
5908
+ break;
5909
+ }
5526
5910
 
5527
- bool mul_mat_vec_q_implemented =
5528
- src0->type == GGML_TYPE_Q4_0 ||
5529
- src0->type == GGML_TYPE_Q4_1 ||
5530
- src0->type == GGML_TYPE_Q5_0 ||
5531
- src0->type == GGML_TYPE_Q5_1 ||
5532
- src0->type == GGML_TYPE_Q8_0;
5533
- #if QK_K == 256
5534
- mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
5535
- src0->type == GGML_TYPE_Q2_K ||
5536
- src0->type == GGML_TYPE_Q3_K ||
5537
- src0->type == GGML_TYPE_Q4_K ||
5538
- src0->type == GGML_TYPE_Q5_K ||
5539
- src0->type == GGML_TYPE_Q6_K;
5540
- #endif // QK_K == 256
5541
-
5542
- const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
5543
- #endif
5911
+ (void) src1;
5912
+ (void) dst;
5913
+ (void) src1_ddf_i;
5914
+ (void) src1_ncols;
5915
+ (void) src1_padded_row_size;
5916
+ }
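The *_q8_1 kernels dispatched above assume src1 was quantized to q8_1 blocks beforehand (see convert_src1_to_q8_1 further down), so the inner loops can use byte-wise dot products. A self-contained sketch of that underlying idea, not a ggml kernel; it needs compute capability >= 6.1, which is what MIN_CC_DP4A == 610 guards:

#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

__global__ void dot_q8(const int8_t * a, const int8_t * b, int * out, const int n) {
    // assumes n is a multiple of 4 and the buffers are 4-byte aligned
    const int * a4 = (const int *) a;
    const int * b4 = (const int *) b;
    int acc = 0;
    for (int i = 0; i < n/4; ++i) {
        acc = __dp4a(a4[i], b4[i], acc); // four int8 multiply-accumulates per call
    }
    *out = acc;
}

int main() {
    const int n = 32;
    int8_t ha[n], hb[n];
    for (int i = 0; i < n; ++i) { ha[i] = 1; hb[i] = 2; }

    int8_t * da; int8_t * db; int * dout; int hout = 0;
    cudaMalloc(&da, n);
    cudaMalloc(&db, n);
    cudaMalloc(&dout, sizeof(int));
    cudaMemcpy(da, ha, n, cudaMemcpyHostToDevice);
    cudaMemcpy(db, hb, n, cudaMemcpyHostToDevice);

    dot_q8<<<1, 1>>>(da, db, dout, n);
    cudaMemcpy(&hout, dout, sizeof(int), cudaMemcpyDeviceToHost);
    printf("dot = %d (expected %d)\n", hout, 2*n);

    cudaFree(da); cudaFree(db); cudaFree(dout);
    return 0;
}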
5544
5917
 
5545
- if (use_mul_mat_vec_q) {
5546
- const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
5547
- ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
5548
- size_t as;
5549
- void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
5550
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
5551
-
5552
- switch (src0->type) {
5553
- case GGML_TYPE_Q4_0:
5554
- mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5555
- break;
5556
- case GGML_TYPE_Q4_1:
5557
- mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5558
- break;
5559
- case GGML_TYPE_Q5_0:
5560
- mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5561
- break;
5562
- case GGML_TYPE_Q5_1:
5563
- mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5564
- break;
5565
- case GGML_TYPE_Q8_0:
5566
- mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5567
- break;
5568
- case GGML_TYPE_Q2_K:
5569
- mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5570
- break;
5571
- case GGML_TYPE_Q3_K:
5572
- mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5573
- break;
5574
- case GGML_TYPE_Q4_K:
5575
- mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5576
- break;
5577
- case GGML_TYPE_Q5_K:
5578
- mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5579
- break;
5580
- case GGML_TYPE_Q6_K:
5581
- mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5582
- break;
5583
- default:
5584
- GGML_ASSERT(false);
5585
- break;
5586
- }
5918
+ inline void ggml_cuda_op_dequantize_mul_mat_vec(
5919
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5920
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
5921
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5587
5922
 
5588
- ggml_cuda_pool_free(src1_q8_1, as);
5589
- } else {
5590
- // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
5923
+ const int64_t ne00 = src0->ne[0];
5924
+ const int64_t row_diff = row_high - row_low;
5925
+
5926
+ // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
5591
5927
  #ifdef GGML_CUDA_F16
5592
- size_t ash;
5593
- dfloat * src1_dfloat = nullptr; // dfloat == half
5594
-
5595
- bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
5596
- src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
5597
- src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
5598
-
5599
- if (src1_convert_f16) {
5600
- src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
5601
- ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
5602
- ne00, 1, sizeof(float), 0, 0,
5603
- ne00, 1, sizeof(half), 0, 0, cudaStream_main);
5604
- }
5928
+ size_t ash;
5929
+ dfloat * src1_dfloat = nullptr; // dfloat == half
5930
+
5931
+ bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
5932
+ src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
5933
+ src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
5934
+
5935
+ if (src1_convert_f16) {
5936
+ src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
5937
+ ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
5938
+ ne00, 1, sizeof(float), 0, 0,
5939
+ ne00, 1, sizeof(half), 0, 0, stream);
5940
+ }
5605
5941
  #else
5606
- dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
5942
+ const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
5607
5943
  #endif // GGML_CUDA_F16
5608
5944
 
5609
- switch (src0->type) {
5610
- case GGML_TYPE_Q4_0:
5611
- dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5612
- break;
5613
- case GGML_TYPE_Q4_1:
5614
- dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5615
- break;
5616
- case GGML_TYPE_Q5_0:
5617
- dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5618
- break;
5619
- case GGML_TYPE_Q5_1:
5620
- dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5621
- break;
5622
- case GGML_TYPE_Q8_0:
5623
- dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5624
- break;
5625
- case GGML_TYPE_Q2_K:
5626
- dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5627
- break;
5628
- case GGML_TYPE_Q3_K:
5629
- dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5630
- break;
5631
- case GGML_TYPE_Q4_K:
5632
- dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5633
- break;
5634
- case GGML_TYPE_Q5_K:
5635
- dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5636
- break;
5637
- case GGML_TYPE_Q6_K:
5638
- dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5639
- break;
5640
- case GGML_TYPE_F16:
5641
- convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5642
- break;
5643
- default:
5644
- GGML_ASSERT(false);
5645
- break;
5646
- }
5945
+ switch (src0->type) {
5946
+ case GGML_TYPE_Q4_0:
5947
+ dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5948
+ break;
5949
+ case GGML_TYPE_Q4_1:
5950
+ dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5951
+ break;
5952
+ case GGML_TYPE_Q5_0:
5953
+ dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5954
+ break;
5955
+ case GGML_TYPE_Q5_1:
5956
+ dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5957
+ break;
5958
+ case GGML_TYPE_Q8_0:
5959
+ dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5960
+ break;
5961
+ case GGML_TYPE_Q2_K:
5962
+ dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5963
+ break;
5964
+ case GGML_TYPE_Q3_K:
5965
+ dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5966
+ break;
5967
+ case GGML_TYPE_Q4_K:
5968
+ dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5969
+ break;
5970
+ case GGML_TYPE_Q5_K:
5971
+ dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5972
+ break;
5973
+ case GGML_TYPE_Q6_K:
5974
+ dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5975
+ break;
5976
+ case GGML_TYPE_F16:
5977
+ convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5978
+ break;
5979
+ default:
5980
+ GGML_ASSERT(false);
5981
+ break;
5982
+ }
5647
5983
 
5648
5984
  #ifdef GGML_CUDA_F16
5649
- if (src1_convert_f16) {
5650
- ggml_cuda_pool_free(src1_dfloat, ash);
5651
- }
5652
- #endif // GGML_CUDA_F16
5985
+ if (src1_convert_f16) {
5986
+ ggml_cuda_pool_free(src1_dfloat, ash);
5653
5987
  }
5988
+ #endif // GGML_CUDA_F16
5654
5989
 
5655
5990
  (void) src1;
5656
5991
  (void) dst;
5657
- (void) src0_ddf_i;
5658
- (void) i02;
5659
- (void) i1;
5992
+ (void) src1_ddq_i;
5993
+ (void) src1_ncols;
5994
+ (void) src1_padded_row_size;
5660
5995
  }
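When GGML_CUDA_F16 is defined, the branch above converts src1 to half before launching the dequantize kernels. A hypothetical standalone conversion kernel in the same spirit (the code above uses ggml_cpy_f32_f16_cuda for this instead):

#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cstdio>

__global__ void f32_to_f16(const float * x, __half * y, const int k) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < k) {
        y[i] = __float2half(x[i]);
    }
}

int main() {
    const int k = 1024;
    float  * x = nullptr;
    __half * y = nullptr;
    cudaMallocManaged(&x, k*sizeof(float));
    cudaMallocManaged(&y, k*sizeof(__half));
    for (int i = 0; i < k; ++i) {
        x[i] = 0.5f*i;
    }

    f32_to_f16<<<(k + 255)/256, 256>>>(x, y, k);
    cudaDeviceSynchronize();

    printf("y[2] = %f\n", __half2float(y[2])); // expected 1.0

    cudaFree(x);
    cudaFree(y);
    return 0;
}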
5661
5996
 
5662
5997
  inline void ggml_cuda_op_mul_mat_cublas(
5663
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5664
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5665
- cudaStream_t & cudaStream_main){
5998
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5999
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
6000
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5666
6001
 
5667
- GGML_ASSERT(src0_ddf_i != nullptr);
6002
+ GGML_ASSERT(src0_dd_i != nullptr);
5668
6003
  GGML_ASSERT(src1_ddf_i != nullptr);
5669
- GGML_ASSERT(dst_ddf_i != nullptr);
6004
+ GGML_ASSERT(dst_dd_i != nullptr);
5670
6005
 
5671
6006
  const float alpha = 1.0f;
5672
6007
  const float beta = 0.0f;
@@ -5674,43 +6009,54 @@ inline void ggml_cuda_op_mul_mat_cublas(
5674
6009
  const int64_t ne00 = src0->ne[0];
5675
6010
 
5676
6011
  const int64_t ne10 = src1->ne[0];
5677
- const int64_t ne11 = src1->ne[1];
5678
6012
 
5679
6013
  const int64_t ne0 = dst->ne[0];
5680
- const int64_t i01_diff = i01_high - i01_low;
6014
+ const int64_t row_diff = row_high - row_low;
6015
+
6016
+ float * src0_ddq_as_f32;
6017
+ size_t src0_as = 0;
6018
+
6019
+ if (src0->type != GGML_TYPE_F32) {
6020
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
6021
+ src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
6022
+ to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
6023
+ }
6024
+ const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
5681
6025
 
5682
6026
  int id;
5683
6027
  CUDA_CHECK(cudaGetDevice(&id));
5684
6028
 
5685
6029
  // the main device has a larger memory buffer to hold the results from all GPUs
5686
6030
  // ldc == nrows of the matrix that cuBLAS writes into
5687
- int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
6031
+ int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
5688
6032
 
5689
- CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], cudaStream_main));
6033
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
5690
6034
  CUBLAS_CHECK(
5691
6035
  cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
5692
- i01_diff, ne11, ne10,
6036
+ row_diff, src1_ncols, ne10,
5693
6037
  &alpha, src0_ddf_i, ne00,
5694
- src1_ddf_i, ne10,
5695
- &beta, dst_ddf_i, ldc));
6038
+ src1_ddf_i, ne10,
6039
+ &beta, dst_dd_i, ldc));
6040
+
6041
+ if (src0_as > 0) {
6042
+ ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
6043
+ }
5696
6044
 
5697
6045
  (void) dst;
5698
- (void) src0_ddq_i;
5699
- (void) i02;
5700
- (void) i1;
6046
+ (void) src1_ddq_i;
6047
+ (void) src1_padded_row_size;
5701
6048
  }
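The cublasSgemm call above produces a row_diff x src1_ncols block of dst from a ne00-deep contraction, passing the src0 slice with CUBLAS_OP_T. A standalone example of the same call shape on small hypothetical buffers (sizes and values are made up; error handling omitted for brevity):

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
    // m, n, k play the roles of row_diff, src1_ncols, ne00 above
    const int m = 4, n = 3, k = 8;
    std::vector<float> A(k*m, 1.0f); // src0 slice, stored k x m column-major
    std::vector<float> B(k*n, 2.0f); // src1 columns, stored k x n column-major
    std::vector<float> C(m*n, 0.0f); // dst block, m x n column-major

    float * dA = nullptr; float * dB = nullptr; float * dC = nullptr;
    cudaMalloc(&dA, A.size()*sizeof(float));
    cudaMalloc(&dB, B.size()*sizeof(float));
    cudaMalloc(&dC, C.size()*sizeof(float));
    cudaMemcpy(dA, A.data(), A.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, B.data(), B.size()*sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    const float alpha = 1.0f;
    const float beta  = 0.0f;
    // C = A^T * B, matching the CUBLAS_OP_T / CUBLAS_OP_N arguments above
    cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                m, n, k, &alpha, dA, k, dB, k, &beta, dC, m);

    cudaMemcpy(C.data(), dC, C.size()*sizeof(float), cudaMemcpyDeviceToHost);
    printf("C[0] = %.1f (expected %.1f)\n", C[0], 2.0f*k);

    cublasDestroy(handle);
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    return 0;
}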
5702
6049
 
5703
6050
  inline void ggml_cuda_op_rope(
5704
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5705
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5706
- cudaStream_t & cudaStream_main){
6051
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6052
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5707
6053
 
5708
- GGML_ASSERT(src0_ddf_i != nullptr);
5709
- GGML_ASSERT(dst_ddf_i != nullptr);
6054
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6055
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5710
6056
 
5711
6057
  const int64_t ne00 = src0->ne[0];
5712
6058
  const int64_t ne01 = src0->ne[1];
5713
- const int64_t i01_diff = i01_high - i01_low;
6059
+ const int64_t nrows = ggml_nrows(src0);
5714
6060
 
5715
6061
  const int n_past = ((int32_t *) dst->op_params)[0];
5716
6062
  const int n_dims = ((int32_t *) dst->op_params)[1];
@@ -5723,44 +6069,37 @@ inline void ggml_cuda_op_rope(
5723
6069
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
5724
6070
 
5725
6071
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
6072
+ const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5726
6073
 
5727
6074
  const bool is_neox = mode & 2;
5728
6075
  const bool is_glm = mode & 4;
5729
6076
 
5730
6077
  // compute
5731
6078
  if (is_glm) {
5732
- const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
5733
- const float id_p = min(p, n_ctx - 2.f);
5734
- const float block_p = max(p - (n_ctx - 2.f), 0.f);
5735
- rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
6079
+ rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream);
5736
6080
  } else if (is_neox) {
5737
6081
  GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
5738
- const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5739
- rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
6082
+ rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
5740
6083
  } else {
5741
- const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5742
- rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
6084
+ rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
5743
6085
  }
5744
6086
 
5745
6087
  (void) src1;
5746
6088
  (void) dst;
5747
- (void) src0_ddq_i;
5748
- (void) src1_ddf_i;
5749
- (void) i1;
6089
+ (void) src1_dd;
5750
6090
  }
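With p0 and theta_scale defined as above, the non-neox branch rotates each consecutive pair of values in a row, with the angle scaled by theta_scale from one pair to the next. A CPU reference of that rotation for a single row (illustrative only; rope_f32_cuda additionally offsets the angle per row, which is omitted here, and the p0/theta_scale values below are arbitrary samples):

#include <cmath>
#include <cstdio>

static void rope_row_ref(float * x, const int ne0, const float p0, const float theta_scale) {
    float theta = p0;
    for (int i = 0; i < ne0; i += 2) {
        const float c  = std::cos(theta);
        const float s  = std::sin(theta);
        const float x0 = x[i];
        const float x1 = x[i + 1];
        x[i]     = x0*c - x1*s; // rotate the pair (x[i], x[i+1]) in place
        x[i + 1] = x0*s + x1*c;
        theta *= theta_scale;
    }
}

int main() {
    float row[8] = {1, 0, 1, 0, 1, 0, 1, 0};
    rope_row_ref(row, 8, /*p0=*/3.0f, /*theta_scale=*/0.5f);
    for (float v : row) {
        printf("%f ", v);
    }
    printf("\n");
    return 0;
}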
5751
6091
 
5752
6092
  inline void ggml_cuda_op_alibi(
5753
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5754
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5755
- cudaStream_t & cudaStream_main){
6093
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6094
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5756
6095
 
5757
- GGML_ASSERT(src0_ddf_i != nullptr);
5758
- GGML_ASSERT(dst_ddf_i != nullptr);
6096
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6097
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5759
6098
 
5760
6099
  const int64_t ne00 = src0->ne[0];
5761
6100
  const int64_t ne01 = src0->ne[1];
5762
6101
  const int64_t ne02 = src0->ne[2];
5763
- const int64_t i01_diff = i01_high - i01_low;
6102
+ const int64_t nrows = ggml_nrows(src0);
5764
6103
 
5765
6104
  const int n_past = ((int32_t *) dst->op_params)[0];
5766
6105
  const int n_head = ((int32_t *) dst->op_params)[1];
@@ -5775,334 +6114,354 @@ inline void ggml_cuda_op_alibi(
5775
6114
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
5776
6115
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
5777
6116
 
5778
- // compute
5779
- alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
6117
+ alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
5780
6118
 
5781
6119
  (void) src1;
5782
- (void) src0_ddq_i;
5783
- (void) src1_ddf_i;
5784
- (void) i1;
6120
+ (void) src1_dd;
5785
6121
  }
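The m0/m1 values above implement the standard ALiBi slope schedule. A CPU reference of the per-head slope they feed into (the head-indexing formula follows the ALiBi paper; the 12-head / max_bias = 8.0f example values are hypothetical):

#include <cmath>
#include <cstdio>

static float alibi_slope(const int h, const int n_head, const float max_bias) {
    const int   n_heads_log2_floor = 1 << (int) std::floor(std::log2((float) n_head));
    const float m0 = std::pow(2.0f, -max_bias        / n_heads_log2_floor);
    const float m1 = std::pow(2.0f, -(max_bias/2.0f) / n_heads_log2_floor);
    return h < n_heads_log2_floor ? std::pow(m0, h + 1)
                                  : std::pow(m1, 2*(h - n_heads_log2_floor) + 1);
}

int main() {
    for (int h = 0; h < 12; ++h) {
        printf("head %2d: slope %f\n", h, alibi_slope(h, 12, 8.0f));
    }
    return 0;
}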
5786
6122
 
5787
6123
  inline void ggml_cuda_op_diag_mask_inf(
5788
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5789
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5790
- cudaStream_t & cudaStream_main){
6124
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6125
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5791
6126
 
5792
- GGML_ASSERT(src0_ddf_i != nullptr);
5793
- GGML_ASSERT(dst_ddf_i != nullptr);
6127
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6128
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5794
6129
 
5795
6130
  const int64_t ne00 = src0->ne[0];
5796
6131
  const int64_t ne01 = src0->ne[1];
5797
- const int64_t i01_diff = i01_high - i01_low;
6132
+ const int nrows0 = ggml_nrows(src0);
5798
6133
 
5799
6134
  const int n_past = ((int32_t *) dst->op_params)[0];
5800
6135
 
5801
- // compute
5802
- diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
6136
+ diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
5803
6137
 
5804
6138
  (void) src1;
5805
6139
  (void) dst;
5806
- (void) src0_ddq_i;
5807
- (void) src1_ddf_i;
5808
- (void) i02;
5809
- (void) i1;
6140
+ (void) src1_dd;
5810
6141
  }
5811
6142
 
5812
6143
  inline void ggml_cuda_op_soft_max(
5813
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5814
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5815
- cudaStream_t & cudaStream_main){
6144
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6145
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5816
6146
 
5817
- GGML_ASSERT(src0_ddf_i != nullptr);
5818
- GGML_ASSERT(dst_ddf_i != nullptr);
6147
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6148
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5819
6149
 
5820
6150
  const int64_t ne00 = src0->ne[0];
5821
- const int64_t i01_diff = i01_high - i01_low;
6151
+ const int64_t nrows = ggml_nrows(src0);
5822
6152
 
5823
- // compute
5824
- soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
6153
+ soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
5825
6154
 
5826
6155
  (void) src1;
5827
6156
  (void) dst;
5828
- (void) src0_ddq_i;
5829
- (void) src1_ddf_i;
5830
- (void) i02;
5831
- (void) i1;
6157
+ (void) src1_dd;
5832
6158
  }
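soft_max_f32_cuda reduces over ne00 elements per row. A CPU reference of the same row-wise softmax, written in the numerically stable max-subtraction form (illustrative; the exact reduction strategy of the kernel is not shown here):

#include <cmath>
#include <cstdio>

static void softmax_row_ref(const float * x, float * y, const int n) {
    float maxv = x[0];
    for (int i = 1; i < n; ++i) {
        maxv = std::fmax(maxv, x[i]);
    }
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        y[i] = std::exp(x[i] - maxv); // subtracting the row maximum avoids overflow in exp
        sum += y[i];
    }
    for (int i = 0; i < n; ++i) {
        y[i] /= sum;
    }
}

int main() {
    const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float y[4];
    softmax_row_ref(x, y, 4);
    printf("%f %f %f %f\n", y[0], y[1], y[2], y[3]);
    return 0;
}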
5833
6159
 
5834
6160
  inline void ggml_cuda_op_scale(
5835
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5836
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5837
- cudaStream_t & cudaStream_main){
6161
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6162
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5838
6163
 
5839
- GGML_ASSERT(src0_ddf_i != nullptr);
5840
- GGML_ASSERT(dst_ddf_i != nullptr);
6164
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6165
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
6166
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5841
6167
 
5842
6168
  const float scale = ((float *) src1->data)[0];
5843
6169
 
5844
- const int64_t ne00 = src0->ne[0];
5845
- const int64_t i01_diff = i01_high - i01_low;
5846
-
5847
- // compute
5848
- scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
6170
+ scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
5849
6171
  CUDA_CHECK(cudaGetLastError());
5850
6172
 
5851
6173
  (void) src1;
5852
6174
  (void) dst;
5853
- (void) src0_ddq_i;
5854
- (void) src1_ddf_i;
5855
- (void) i02;
5856
- (void) i1;
6175
+ (void) src1_dd;
6176
+ }
6177
+
6178
+ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
6179
+ const int64_t nrows0 = ggml_nrows(src0);
6180
+
6181
+ const bool use_src1 = src1 != nullptr;
6182
+ const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
6183
+
6184
+ GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6185
+ GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
6186
+
6187
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6188
+ struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
6189
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6190
+
6191
+ const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6192
+ const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
6193
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
6194
+
6195
+ const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
6196
+
6197
+ // dd = data device
6198
+ float * src0_ddf = nullptr;
6199
+ float * src1_ddf = nullptr;
6200
+ float * dst_ddf = nullptr;
6201
+
6202
+ // as = actual size
6203
+ size_t src0_asf = 0;
6204
+ size_t src1_asf = 0;
6205
+ size_t dst_asf = 0;
6206
+
6207
+ ggml_cuda_set_device(g_main_device);
6208
+ const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6209
+
6210
+ if (src0_on_device) {
6211
+ src0_ddf = (float *) src0_extra->data_device[g_main_device];
6212
+ } else {
6213
+ src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
6214
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
6215
+ }
6216
+
6217
+ if (use_src1 && !src1_stays_on_host) {
6218
+ if (src1_on_device) {
6219
+ src1_ddf = (float *) src1_extra->data_device[g_main_device];
6220
+ } else {
6221
+ src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
6222
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
6223
+ }
6224
+ }
6225
+ if (dst_on_device) {
6226
+ dst_ddf = (float *) dst_extra->data_device[g_main_device];
6227
+ } else {
6228
+ dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
6229
+ }
6230
+
6231
+ // do the computation
6232
+ op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
6233
+ CUDA_CHECK(cudaGetLastError());
6234
+
6235
+ // copy dst to host if necessary
6236
+ if (!dst_on_device) {
6237
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
6238
+ }
6239
+
6240
+ if (src0_asf > 0) {
6241
+ ggml_cuda_pool_free(src0_ddf, src0_asf);
6242
+ }
6243
+ if (src1_asf > 0) {
6244
+ ggml_cuda_pool_free(src1_ddf, src1_asf);
6245
+ }
6246
+ if (dst_asf > 0) {
6247
+ ggml_cuda_pool_free(dst_ddf, dst_asf);
6248
+ }
6249
+
6250
+ if (dst->backend == GGML_BACKEND_CPU) {
6251
+ CUDA_CHECK(cudaDeviceSynchronize());
6252
+ }
5857
6253
  }
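ggml_cuda_op_flatten stages whole tensors on the main device's stream 0, runs the op once over all rows, and copies the result back when dst lives on the host. A reduced standalone illustration of that staging pattern around a hypothetical elementwise kernel (names and sizes are made up):

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

__global__ void scale_kernel(const float * x, float * y, const float scale, const int k) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < k) {
        y[i] = scale*x[i];
    }
}

int main() {
    const int k = 1024;
    std::vector<float> h_in(k, 3.0f);
    std::vector<float> h_out(k, 0.0f);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    float * d_in  = nullptr;
    float * d_out = nullptr;
    cudaMalloc(&d_in,  k*sizeof(float));
    cudaMalloc(&d_out, k*sizeof(float));

    // stage input -> compute -> copy result back, all ordered on one stream
    cudaMemcpyAsync(d_in, h_in.data(), k*sizeof(float), cudaMemcpyHostToDevice, stream);
    scale_kernel<<<(k + 255)/256, 256, 0, stream>>>(d_in, d_out, 0.5f, k);
    cudaMemcpyAsync(h_out.data(), d_out, k*sizeof(float), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream); // plays the role of the final cudaDeviceSynchronize() for a CPU-backed dst

    printf("out[0] = %f\n", h_out[0]); // expected 1.5

    cudaFree(d_in);
    cudaFree(d_out);
    cudaStreamDestroy(stream);
    return 0;
}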
5858
6254
 
5859
- static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5860
- ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
6255
+ static void ggml_cuda_op_mul_mat(
6256
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
6257
+ const bool convert_src1_to_q8_1) {
6258
+
5861
6259
  const int64_t ne00 = src0->ne[0];
5862
6260
  const int64_t ne01 = src0->ne[1];
5863
6261
  const int64_t ne02 = src0->ne[2];
5864
6262
  const int64_t ne03 = src0->ne[3];
5865
6263
  const int64_t nrows0 = ggml_nrows(src0);
5866
6264
 
5867
- const bool use_src1 = src1 != nullptr;
5868
- const int64_t ne10 = use_src1 ? src1->ne[0] : 1;
5869
- const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
5870
- const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
5871
- const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
5872
- const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
6265
+ const int64_t ne10 = src1->ne[0];
6266
+ const int64_t ne11 = src1->ne[1];
6267
+ const int64_t ne12 = src1->ne[2];
6268
+ const int64_t ne13 = src1->ne[3];
6269
+ const int64_t nrows1 = ggml_nrows(src1);
5873
6270
 
5874
6271
  GGML_ASSERT(ne03 == ne13);
5875
6272
 
5876
6273
  const int64_t ne0 = dst->ne[0];
5877
6274
  const int64_t ne1 = dst->ne[1];
5878
6275
 
5879
- const int nb2 = dst->nb[2];
5880
- const int nb3 = dst->nb[3];
6276
+ const int nb2 = dst->nb[2];
6277
+ const int nb3 = dst->nb[3];
5881
6278
 
5882
6279
  GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
5883
- GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6280
+ GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
5884
6281
 
5885
- // strides for iteration over dims 3 and 2
5886
- const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
5887
- const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
5888
- const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
5889
- const int64_t src0_stride = ne00 * ne01 * stride_mod;
5890
- const int64_t src1_stride = ne10 * ne11 * stride_mod;
5891
- const int64_t dst_stride = ne0 * ne1 * stride_mod;
6282
+ GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
5892
6283
 
5893
- const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
5894
- const int64_t i03_max = flatten_rows ? 1 : ne03;
5895
- const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
5896
- const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
5897
- GGML_ASSERT(!(flatten_rows && ne02 < ne12));
6284
+ const int64_t i02_divisor = ne12 / ne02;
5898
6285
 
5899
6286
  const size_t src0_ts = ggml_type_size(src0->type);
5900
6287
  const size_t src0_bs = ggml_blck_size(src0->type);
6288
+ const size_t q8_1_ts = sizeof(block_q8_1);
6289
+ const size_t q8_1_bs = QK8_1;
5901
6290
 
5902
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
5903
- struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
5904
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6291
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6292
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6293
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
5905
6294
 
5906
6295
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
5907
6296
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
5908
- const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
5909
6297
 
5910
- const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
5911
- const bool src1_stays_on_host = use_src1 && (
5912
- dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
6298
+ const bool src1_is_contiguous = ggml_is_contiguous(src1);
6299
+ const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ?
6300
+ ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
5913
6301
 
5914
6302
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
6303
+ GGML_ASSERT(!(split && ne02 > 1));
6304
+ GGML_ASSERT(!(split && ne03 > 1));
5915
6305
  GGML_ASSERT(!(split && ne02 < ne12));
5916
6306
 
5917
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
5918
-
5919
6307
  // dd = data device
5920
- char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
5921
- float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
5922
- float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
5923
- float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
5924
-
5925
- // asq = actual size quantized, asf = actual size float
5926
- size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
5927
- size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
6308
+ char * src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
6309
+ float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
6310
+ char * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
6311
+ float * dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
6312
+
6313
+ // as = actual size
6314
+ size_t src0_as[GGML_CUDA_MAX_DEVICES] = {0};
5928
6315
  size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
5929
- size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
6316
+ size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
6317
+ size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0};
5930
6318
 
5931
- // if multiple devices are used they need to wait for the main device
5932
- // here an event is recorded that signifies that the main device has finished calculating the input data
5933
- if (split && g_device_count > 1) {
5934
- CUDA_CHECK(cudaSetDevice(g_main_device));
5935
- CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
5936
- }
6319
+ int64_t row_low[GGML_CUDA_MAX_DEVICES];
6320
+ int64_t row_high[GGML_CUDA_MAX_DEVICES];
5937
6321
 
5938
- for (int id = 0; id < g_device_count; ++id) {
5939
- if (!split && id != g_main_device) {
5940
- continue;
5941
- }
6322
+ for (int64_t id = 0; id < g_device_count; ++id) {
6323
+ // by default, use all rows
6324
+ row_low[id] = 0;
6325
+ row_high[id] = ne01;
5942
6326
 
5943
- const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device;
5944
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
5945
-
5946
- int64_t row_low, row_high;
6327
+ // for multi GPU, get the row boundaries from tensor split
6328
+ // and round to mul_mat_q tile sizes
5947
6329
  if (split) {
5948
6330
  const int64_t rounding = get_row_rounding(src0->type);
5949
6331
 
5950
- row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
5951
- row_low -= row_low % rounding;
6332
+ if (id != 0) {
6333
+ row_low[id] = ne01*g_tensor_split[id];
6334
+ row_low[id] -= row_low[id] % rounding;
6335
+ }
5952
6336
 
5953
- if (id == g_device_count - 1) {
5954
- row_high = nrows0;
5955
- } else {
5956
- row_high = nrows0*g_tensor_split[id + 1];
5957
- row_high -= row_high % rounding;
6337
+ if (id != g_device_count - 1) {
6338
+ row_high[id] = ne01*g_tensor_split[id + 1];
6339
+ row_high[id] -= row_high[id] % rounding;
5958
6340
  }
5959
- } else {
5960
- row_low = 0;
5961
- row_high = nrows0*i02_divisor;
5962
6341
  }
5963
- if (row_low == row_high) {
6342
+ }
6343
+
6344
+ for (int64_t id = 0; id < g_device_count; ++id) {
6345
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
5964
6346
  continue;
5965
6347
  }
5966
6348
 
5967
- int64_t row_diff = row_high - row_low;
5968
-
5969
- cudaSetDevice(id);
5970
- cudaStream_t cudaStream_main = g_cudaStreams_main[id];
6349
+ const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
6350
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
5971
6351
 
5972
- // wait for main GPU data if necessary
5973
- if (split && id != g_main_device) {
5974
- CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
5975
- }
6352
+ ggml_cuda_set_device(id);
6353
+ const cudaStream_t stream = g_cudaStreams[id][0];
5976
6354
 
5977
6355
  if (src0_on_device && src0_is_contiguous) {
5978
- if (src0_is_f32) {
5979
- src0_ddf[id] = (float *) src0_extra->data_device[id];
5980
- } else {
5981
- src0_ddq[id] = (char *) src0_extra->data_device[id];
5982
- }
6356
+ src0_dd[id] = (char *) src0_extra->data_device[id];
5983
6357
  } else {
5984
- if (src0_is_f32) {
5985
- src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
5986
- } else {
5987
- src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
5988
- }
6358
+ const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
6359
+ src0_dd[id] = (char *) ggml_cuda_pool_malloc(size_src0_ddq, &src0_as[id]);
5989
6360
  }
5990
6361
 
5991
- if (src0_needs_f32 && !src0_is_f32) {
5992
- src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
6362
+ if (src1_on_device && src1_is_contiguous) {
6363
+ src1_ddf[id] = (float *) src1_extra->data_device[id];
6364
+ } else {
6365
+ src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
5993
6366
  }
5994
6367
 
5995
- if (use_src1 && !src1_stays_on_host) {
5996
- if (src1_on_device && src1_is_contiguous) {
5997
- src1_ddf[id] = (float *) src1_extra->data_device[id];
5998
- } else {
5999
- src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
6368
+ if (convert_src1_to_q8_1) {
6369
+ src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
6370
+
6371
+ if (split && src1_on_device && src1_is_contiguous) {
6372
+ quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
6373
+ CUDA_CHECK(cudaGetLastError());
6000
6374
  }
6001
6375
  }
6376
+
6002
6377
  if (dst_on_device) {
6003
- dst_ddf[id] = (float *) dst_extra->data_device[id];
6378
+ dst_dd[id] = (float *) dst_extra->data_device[id];
6004
6379
  } else {
6005
- size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float);
6006
- dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
6380
+ const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
6381
+ dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
6007
6382
  }
6383
+ }
6008
6384
 
6009
- for (int64_t i03 = 0; i03 < i03_max; i03++) {
6010
- const int64_t i13 = i03 % ne13;
6011
- for (int64_t i02 = 0; i02 < i02_max; i02++) {
6012
- const int64_t i12 = i02 % ne12;
6385
+ // if multiple devices are used they need to wait for the main device
6386
+ // here an event is recorded that signals that the main device has finished calculating the input data
6387
+ if (split && g_device_count > 1) {
6388
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6389
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
6390
+ }
6013
6391
 
6014
- const int64_t i0 = i03*i02_max + i02;
6392
+ const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
6393
+ for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
6394
+ const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
6395
+ const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
6015
6396
 
6016
- // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
6017
- const int64_t i0_offset_low = row_low/rows_per_iter;
6018
- const int64_t i0_offset_high = row_high/rows_per_iter;
6397
+ for (int64_t id = 0; id < g_device_count; ++id) {
6398
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
6399
+ continue;
6400
+ }
6019
6401
 
6020
- int64_t i01_low = 0;
6021
- int64_t i01_high = rows_per_iter;
6022
- if (split) {
6023
- if (i0 < i0_offset_low || i0 > i0_offset_high) {
6024
- continue;
6025
- }
6026
- if (i0 == i0_offset_low) {
6027
- i01_low = row_low % rows_per_iter;
6028
- }
6029
- if (i0 == i0_offset_high) {
6030
- i01_high = row_high % rows_per_iter;
6031
- }
6032
- }
6402
+ const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
6403
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
6404
+ const int64_t row_diff = row_high[id] - row_low[id];
6033
6405
 
6034
- // There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or optimizing out local variables.
6035
- // Removing the first assert or changing the order of the arguments causes the second assert to fail.
6036
- // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
6037
- // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
6038
- GGML_ASSERT(i01_low == 0 || g_device_count > 1);
6039
- GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
6406
+ ggml_cuda_set_device(id);
6407
+ const cudaStream_t stream = g_cudaStreams[id][is];
6040
6408
 
6041
- const int64_t i01_diff = i01_high - i01_low;
6042
- if (i01_diff == 0) {
6043
- continue;
6044
- }
6045
- const int64_t i11 = i13*ne12 + i12;
6409
+ // wait for main GPU data if necessary
6410
+ if (split && (id != g_main_device || is != 0)) {
6411
+ CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
6412
+ }
6413
+
6414
+ for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
6415
+ const int64_t i03 = i0 / ne12;
6416
+ const int64_t i02 = i0 % ne12;
6417
+
6418
+ const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
6046
6419
 
6047
6420
 // for split tensors each device buffer only holds the rows [row_low[id], row_high[id])
6048
- char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
6049
- float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
6050
- float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
6051
- float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
6052
-
6053
- // for split tensors the data pointer needs to be rounded down
6054
- // to the bin edge for i03, i02 bins beyond the first
6055
- if (i0 - i0_offset_low > 0) {
6056
- GGML_ASSERT(!flatten_rows);
6057
- src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
6058
- src0_ddf_i -= (row_low % ne01)*ne00;
6059
- dst_ddf_i -= (row_low % ne0)*ne1;
6060
- }
6421
+ char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
6422
+ float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
6423
+ char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
6424
+ float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
6061
6425
 
6062
6426
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
6063
6427
 // in that case an offset on dst_dd_i is needed
6064
6428
  if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
6065
- dst_ddf_i += i01_low; // offset is 0 if no tensor split
6429
+ dst_dd_i += row_low[id]; // offset is 0 if no tensor split
6066
6430
  }
6067
6431
 
6068
6432
  // copy src0, src1 to device if necessary
6069
- if (use_src1 && !src1_stays_on_host) {
6070
- if (src1->backend == GGML_BACKEND_CPU) {
6071
- GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
6072
- int64_t nrows1 = flatten_rows ? nrows0 : ne11;
6073
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
6074
- } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
6075
- if (id != g_main_device) {
6076
- GGML_ASSERT(!flatten_rows);
6433
+ if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
6434
+ if (id != g_main_device) {
6435
+ if (convert_src1_to_q8_1) {
6436
+ char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
6437
+ CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs,
6438
+ cudaMemcpyDeviceToDevice, stream));
6439
+ } else {
6077
6440
  float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
6078
- src1_ddf_i_source += i11*src1_stride;
6079
- CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
6080
- cudaMemcpyDeviceToDevice, cudaStream_main));
6441
+ src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
6442
+ CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float),
6443
+ cudaMemcpyDeviceToDevice, stream));
6081
6444
  }
6082
- } else if (src1_on_device && !src1_is_contiguous) {
6083
- GGML_ASSERT(!split);
6084
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
6085
- } else {
6086
- GGML_ASSERT(false);
6087
6445
  }
6446
+ } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
6447
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
6448
+ src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
6449
+ } else {
6450
+ GGML_ASSERT(false);
6088
6451
  }
6089
6452
 
6090
- if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
6091
- if (src0_is_f32) {
6092
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
6093
- } else {
6094
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
6095
- }
6453
+ if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
6454
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
6455
+ CUDA_CHECK(cudaGetLastError());
6096
6456
  }
6097
6457
 
6098
- // convert src0 to f32 if it is necessary for the ggml_cuda_op
6099
- if (src0_needs_f32 && !src0_is_f32) {
6100
- to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
6101
- CUDA_CHECK(cudaGetLastError());
6458
+ if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
6459
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
6102
6460
  }
6103
6461
 
6104
6462
  // do the computation
6105
- op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
6463
+ op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
6464
+ row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
6106
6465
  CUDA_CHECK(cudaGetLastError());
6107
6466
 
6108
6467
  // copy dst to host or other device if necessary
@@ -6124,95 +6483,86 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
6124
6483
  // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
6125
6484
  // Instead they need to be copied to the correct slice in ne0 = dst row index.
6126
6485
  // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
6127
- float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
6128
- CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
6129
- i01_diff*sizeof(float), ne1, kind, cudaStream_main));
6486
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
6487
+ GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
6488
+ dhf_dst_i += src1_col_0*ne0 + row_low[id];
6489
+ CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float),
6490
+ row_diff*sizeof(float), src1_ncols, kind, stream));
6130
6491
  } else {
6131
6492
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
6132
- CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
6493
+ GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
6494
+ dhf_dst_i += src1_col_0*ne0;
6495
+ CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
6133
6496
  }
6134
6497
  }
6135
6498
 
6136
- // signify to main device that other device is done
6137
- if (split && g_device_count > 1 && id != g_main_device) {
6138
- CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
6499
+ // add an event for the main device to wait on until the other device is done
6500
+ if (split && (id != g_main_device || is != 0)) {
6501
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
6139
6502
  }
6140
6503
  }
6141
6504
  }
6142
6505
  }
6143
6506
 
6144
- // wait until each device is finished, then free their buffers
6145
- for (int id = 0; id < g_device_count; ++id) {
6146
- if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
6147
- continue;
6148
- }
6149
-
6150
- CUDA_CHECK(cudaSetDevice(id));
6507
+ for (int64_t id = 0; id < g_device_count; ++id) {
6508
+ CUDA_CHECK(ggml_cuda_set_device(id));
6151
6509
 
6152
- if (src0_asq[id] > 0) {
6153
- ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
6154
- }
6155
- if (src0_asf[id] > 0) {
6156
- ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
6510
+ // free buffers again when done
6511
+ if (src0_as[id] > 0) {
6512
+ ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
6157
6513
  }
6158
6514
  if (src1_asf[id] > 0) {
6159
6515
  ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
6160
6516
  }
6161
- if (dst_asf[id] > 0) {
6162
- ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
6517
+ if (src1_asq[id] > 0) {
6518
+ ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
6519
+ }
6520
+ if (dst_as[id] > 0) {
6521
+ ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
6163
6522
  }
6164
6523
  }
6165
6524
 
6166
6525
  // main device waits for all other devices to be finished
6167
6526
  if (split && g_device_count > 1) {
6168
- CUDA_CHECK(cudaSetDevice(g_main_device));
6169
- for (int id = 0; id < g_device_count; ++id) {
6170
- if (id != g_main_device && src0_extra->events[id]) {
6171
- CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
6527
+ int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
6528
+ is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
6529
+
6530
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6531
+ for (int64_t id = 0; id < g_device_count; ++id) {
6532
+ for (int64_t is = 0; is < is_max; ++is) {
6533
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
6172
6534
  }
6173
6535
  }
6174
6536
  }
6175
6537
 
6176
6538
  if (dst->backend == GGML_BACKEND_CPU) {
6177
- CUDA_CHECK(cudaSetDevice(g_main_device));
6539
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6178
6540
  CUDA_CHECK(cudaDeviceSynchronize());
6179
6541
  }
6180
6542
  }
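The loop above lets each device run ahead on its own streams and synchronizes only through events: every non-main stream records an event once its partial result has been copied, and the main device then waits on all of them. A reduced illustration of that record/wait pattern using two streams on a single device (the code above applies the same pattern across devices and up to MAX_STREAMS streams):

#include <cuda_runtime.h>
#include <cstdio>

__global__ void fill(float * x, const float v, const int k) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < k) {
        x[i] = v;
    }
}

int main() {
    const int k = 1 << 20;
    float * buf = nullptr;
    cudaMalloc(&buf, k*sizeof(float));

    cudaStream_t main_stream;
    cudaStream_t side_stream;
    cudaStreamCreate(&main_stream);
    cudaStreamCreate(&side_stream);

    cudaEvent_t done;
    cudaEventCreateWithFlags(&done, cudaEventDisableTiming);

    // the "other device": produce a partial result on its own stream
    fill<<<(k + 255)/256, 256, 0, side_stream>>>(buf, 1.0f, k);
    cudaEventRecord(done, side_stream);

    // the main stream must not touch the buffer before the side stream is finished
    cudaStreamWaitEvent(main_stream, done, 0);
    fill<<<(k + 255)/256, 256, 0, main_stream>>>(buf, 2.0f, k); // safe: ordered after 'done'

    cudaStreamSynchronize(main_stream);
    printf("done\n");

    cudaEventDestroy(done);
    cudaStreamDestroy(main_stream);
    cudaStreamDestroy(side_stream);
    cudaFree(buf);
    return 0;
}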
6181
6543
 
6182
6544
  void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6183
- // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
6184
- // Due to flatten_rows == true this does in practice not make a difference however.
6185
- // Better solution would be nice but right now that would require disproportionate changes.
6186
- GGML_ASSERT(
6187
- (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
6188
- src1->type == GGML_TYPE_F32 &&
6189
- (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
6190
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
6545
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
6191
6546
  }
6192
6547
 
6193
6548
  void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6194
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6195
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
6549
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
6196
6550
  }
6197
6551
 
6198
6552
  void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6199
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6200
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
6553
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
6201
6554
  }
6202
6555
 
6203
6556
  void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6204
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6205
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
6557
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
6206
6558
  }
6207
6559
 
6208
6560
  void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6209
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6210
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
6561
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
6211
6562
  }
6212
6563
 
6213
6564
  void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6214
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6215
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
6565
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
6216
6566
  }
6217
6567
 
6218
6568
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
@@ -6246,8 +6596,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
6246
6596
 
6247
6597
  const int64_t ne12 = src1->ne[2];
6248
6598
 
6249
- CUDA_CHECK(cudaSetDevice(g_main_device));
6250
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
6599
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6600
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6251
6601
 
6252
6602
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6253
6603
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -6258,7 +6608,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
6258
6608
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6259
6609
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6260
6610
 
6261
- ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
6611
+ ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
6262
6612
  }
6263
6613
 
6264
6614
  void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -6277,8 +6627,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
6277
6627
  const int64_t nb01 = src0->nb[1];
6278
6628
  const int64_t nb02 = src0->nb[2];
6279
6629
 
6280
- CUDA_CHECK(cudaSetDevice(g_main_device));
6281
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
6630
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6631
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6282
6632
 
6283
6633
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6284
6634
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -6289,38 +6639,49 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
6289
6639
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6290
6640
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6291
6641
 
6292
- const int row_stride_x = nb01 / sizeof(half);
6293
- const int channel_stride_x = nb02 / sizeof(half);
6642
+ const int64_t row_stride_x = nb01 / sizeof(half);
6643
+ const int64_t channel_stride_x = nb02 / sizeof(half);
6294
6644
 
6295
- ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
6645
+ ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
6296
6646
  }
6297
6647
 
6298
6648
  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6299
6649
  bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
6300
6650
  src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
6301
6651
 
6652
+ int64_t min_compute_capability = INT_MAX;
6653
+ for (int64_t id = 0; id < g_device_count; ++id) {
6654
+ if (min_compute_capability > g_compute_capabilities[id]
6655
+ && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
6656
+ min_compute_capability = g_compute_capabilities[id];
6657
+ }
6658
+ }
6659
+
6302
6660
  if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
6303
6661
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
6304
6662
  } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
6305
6663
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
6306
6664
 } else if (src0->type == GGML_TYPE_F32) {
6307
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
6665
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
6308
6666
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
6309
6667
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
6310
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
6311
- } else {
6312
- int min_compute_capability = INT_MAX;
6313
- for (int id = 0; id < g_device_count; ++id) {
6314
- if (min_compute_capability > g_compute_capabilities[id]
6315
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
6316
- min_compute_capability = g_compute_capabilities[id];
6317
- }
6318
- }
6319
6668
 
6669
+ #ifdef GGML_CUDA_FORCE_DMMV
6670
+ const bool use_mul_mat_vec_q = false;
6671
+ #else
6672
+ const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
6673
+ #endif // GGML_CUDA_FORCE_DMMV
6674
+
6675
+ if (use_mul_mat_vec_q) {
6676
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
6677
+ } else {
6678
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
6679
+ }
6680
+ } else {
6320
6681
  if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
6321
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
6682
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
6322
6683
  } else {
6323
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
6684
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
6324
6685
  }
6325
6686
  }
6326
6687
  } else {
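The rewritten ggml_cuda_mul_mat hoists the minimum-compute-capability scan out of the quantized branch. The scan only counts devices that actually receive rows under g_tensor_split: a device whose split fraction equals the next one owns an empty range and is ignored. A small self-contained illustration of that rule, with names local to this sketch:

    #include <climits>
    #include <cstdio>

    // Illustration only: mirror the scan in the hunk above with sketch-local names.
    static int min_cc_of_used_devices(const int * cc, const float * split, int n_devices) {
        int min_cc = INT_MAX;
        for (int id = 0; id < n_devices; ++id) {
            const float next_start = id + 1 < n_devices ? split[id + 1] : 1.0f;
            if (split[id] < next_start && cc[id] < min_cc) { // device id owns a non-empty row range
                min_cc = cc[id];
            }
        }
        return min_cc;
    }

    int main() {
        const int   cc[2]    = {860, 610};   // e.g. an Ampere card and a Pascal card
        const float split[2] = {0.0f, 1.0f}; // all rows assigned to device 0
        printf("%d\n", min_cc_of_used_devices(cc, split, 2)); // prints 860: device 1 is ignored
        return 0;
    }

The GGML_CUDA_FORCE_DMMV branch is a plain preprocessor switch, so forcing the dequantize path amounts to compiling ggml-cuda.cu with -DGGML_CUDA_FORCE_DMMV; which build option, if any, the gem exposes for that is not shown in this diff.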
@@ -6329,8 +6690,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
  }
 
  void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
  }
 
  void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6359,8 +6719,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
  const int64_t nb11 = src1->nb[1];
  const int64_t nb12 = src1->nb[2];
 
- CUDA_CHECK(cudaSetDevice(g_main_device));
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
  const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
  const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -6370,10 +6730,10 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
 
  if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
  ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
- ne10, ne11, nb10, nb11, nb12, cudaStream_main);
+ ne10, ne11, nb10, nb11, nb12, main_stream);
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
  ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
- ne10, ne11, nb10, nb11, nb12, cudaStream_main);
+ ne10, ne11, nb10, nb11, nb12, main_stream);
  } else {
  GGML_ASSERT(false);
  }
@@ -6387,28 +6747,20 @@ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
  }
 
  void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
  }
 
  void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
  }
 
  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
  GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
-
- const int mode = ((int32_t *) dst->op_params)[2];
- const bool is_glm = mode & 4;
-
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
  }
 
  void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
  }
 
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
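The removed ggml_cuda_rope lines read the rope mode from the op's parameter block and tested bit 2 (value 4) for GLM-style rope before deciding whether flattening was allowed; the new code drops that special case and always routes through ggml_cuda_op_flatten. A standalone restatement of that flag check, with made-up parameter values:

    #include <cstdint>
    #include <cstdio>

    int main() {
        int32_t op_params[8] = {0};
        op_params[2] = 4;             // third op parameter carries the rope mode bits
        const int  mode   = op_params[2];
        const bool is_glm = mode & 4; // bit 2 set -> GLM-style rope
        printf("is_glm = %d\n", is_glm ? 1 : 0);
        return 0;
    }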
@@ -6418,7 +6770,7 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
  }
 
  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
- int nrows = ggml_nrows(tensor);
+ const int64_t nrows = ggml_nrows(tensor);
 
  const int64_t ne0 = tensor->ne[0];
 
@@ -6428,14 +6780,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
  memset(extra, 0, sizeof(*extra));
 
- for (int id = 0; id < g_device_count; ++id) {
+ for (int64_t id = 0; id < g_device_count; ++id) {
  if (backend == GGML_BACKEND_GPU && id != g_main_device) {
  continue;
  }
 
- cudaSetDevice(id);
+ ggml_cuda_set_device(id);
 
- int row_low, row_high;
+ int64_t row_low, row_high;
  if (backend == GGML_BACKEND_GPU) {
  row_low = 0;
  row_high = nrows;
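Outside this hunk, ggml_cuda_transform_tensor goes on to compute row_low/row_high for the GGML_BACKEND_GPU_SPLIT case from the g_tensor_split fractions. That computation is not shown in this diff; a hedged sketch of the usual approach, with the rounding granularity left as a parameter:

    #include <cstdint>

    // Sketch under assumptions, not the gem's exact code: derive a device's row
    // range from cumulative split fractions and round down to a block multiple
    // so a quantized block is never split across devices. `rounding` must be > 0.
    static void split_rows(int64_t nrows, const float * split, int id, int n_devices,
                           int64_t rounding, int64_t * row_low, int64_t * row_high) {
        *row_low   = id == 0 ? 0 : (int64_t)(nrows * split[id]);
        *row_low  -= *row_low % rounding;
        *row_high  = id == n_devices - 1 ? nrows : (int64_t)(nrows * split[id + 1]);
        *row_high -= *row_high % rounding;
    }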
@@ -6485,7 +6837,9 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
  extra->data_device[id] = buf;
 
  if (backend == GGML_BACKEND_GPU_SPLIT) {
- CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+ CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
+ }
  }
  }
 
@@ -6499,15 +6853,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
 
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
- for (int id = 0; id < g_device_count; ++id) {
+ for (int64_t id = 0; id < g_device_count; ++id) {
  if (extra->data_device[id] != nullptr) {
- CUDA_CHECK(cudaSetDevice(id));
+ CUDA_CHECK(ggml_cuda_set_device(id));
  CUDA_CHECK(cudaFree(extra->data_device[id]));
  }
 
- if (extra->events[id] != nullptr) {
- CUDA_CHECK(cudaSetDevice(id));
- CUDA_CHECK(cudaEventDestroy(extra->events[id]));
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+ if (extra->events[id][is] != nullptr) {
+ CUDA_CHECK(ggml_cuda_set_device(id));
+ CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
+ }
  }
  }
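With the move from a single per-device event to a MAX_STREAMS array of events per device (created in the hunk above and destroyed here), each compute stream can publish its own completion point. The generic CUDA pattern this enables, shown here as a sketch rather than the gem's exact code, is to record the event on the producing stream and have a consuming stream wait on it without blocking the host:

    #include <cuda_runtime.h>

    // Generic CUDA pattern: the consumer stream waits for all work recorded on
    // the producer stream up to the event, while the host keeps queuing work.
    static void wait_on_partial_result(cudaStream_t producer, cudaStream_t consumer, cudaEvent_t event) {
        cudaEventRecord(event, producer);
        cudaStreamWaitEvent(consumer, event, 0);
    }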
 
@@ -6559,7 +6915,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  force_inplace;
  const size_t size = ggml_nbytes(tensor);
 
- CUDA_CHECK(cudaSetDevice(g_main_device));
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];