llama_cpp 0.5.1 → 0.5.2

This diff shows the content changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -13,7 +13,7 @@
  #ifdef __HIP_PLATFORM_AMD__
  // for rocblas_initialize()
  #include "rocblas/rocblas.h"
- #endif
+ #endif // __HIP_PLATFORM_AMD__
  #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
  #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
  #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
@@ -68,19 +68,29 @@
  #include <cuda_runtime.h>
  #include <cublas_v2.h>
  #include <cuda_fp16.h>
- #endif
+ #endif // defined(GGML_USE_HIPBLAS)

  #include "ggml-cuda.h"
  #include "ggml.h"

- #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
- #ifndef CC_TURING
- #define CC_TURING 700
- #endif
+ #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+ #define CC_TURING 700
+ #define CC_OFFSET_AMD 1000000
+ #define CC_RDNA2 CC_OFFSET_AMD + 1030

  #if defined(GGML_USE_HIPBLAS)
  #define __CUDA_ARCH__ 1300

+ #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+ defined(__gfx1150__) || defined(__gfx1151__)
+ #define RDNA3
+ #endif
+
+ #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+ defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+ #define RDNA2
+ #endif
+
  #ifndef __has_builtin
  #define __has_builtin(x) 0
  #endif
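
Note on the new constants above: CC_OFFSET_AMD folds AMD GPUs into the same integer compute-capability scale the dispatch code already uses for NVIDIA devices. The HIP-reported major/minor version is shifted by 1,000,000, so every AMD value sorts above every CUDA compute capability, and CC_RDNA2 (1,001,030, i.e. gfx1030-class hardware) becomes an ordinary threshold that the launchers later in this diff compare against. A minimal sketch of the scheme, assuming HIP reports major 10 / minor 3 for a gfx1030 card; the helper name is illustrative, not from the source:

    // Sketch: one ordering covers both vendors once AMD values are offset.
    #define CC_OFFSET_AMD 1000000
    #define CC_RDNA2      (CC_OFFSET_AMD + 1030)   // gfx103x (RDNA2) threshold
    #define CC_TURING     700                      // NVIDIA threshold used by the file

    static int effective_compute_capability(int major, int minor, bool is_amd) {
        const int cc = 100*major + 10*minor;       // same formula ggml_init_cublas uses
        return is_amd ? cc + CC_OFFSET_AMD : cc;   // gfx1030 -> 1001030 == CC_RDNA2
    }
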
@@ -132,7 +142,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
  #endif
  return c;
  }
- #endif
+ #endif // defined(GGML_USE_HIPBLAS)

  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -144,8 +154,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  do { \
  cudaError_t err_ = (err); \
  if (err_ != cudaSuccess) { \
- fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
+ int id; \
+ cudaGetDevice(&id); \
+ fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
  cudaGetErrorString(err_)); \
+ fprintf(stderr, "current device: %d\n", id); \
  exit(1); \
  } \
  } while (0)
@@ -155,8 +168,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  do { \
  cublasStatus_t err_ = (err); \
  if (err_ != CUBLAS_STATUS_SUCCESS) { \
+ int id; \
+ cudaGetDevice(&id); \
  fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
  err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
+ fprintf(stderr, "current device: %d\n", id); \
  exit(1); \
  } \
  } while (0)
@@ -165,7 +181,10 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  do { \
  cublasStatus_t err_ = (err); \
  if (err_ != CUBLAS_STATUS_SUCCESS) { \
+ int id; \
+ cudaGetDevice(&id); \
  fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
+ fprintf(stderr, "current device: %d\n", id); \
  exit(1); \
  } \
  } while (0)
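
The error-check macros (CUDA_CHECK and both cuBLAS variants) now query and print the active device before exiting, which makes failures on multi-GPU setups attributable to a specific card. Expanded into an ordinary function, the updated CUDA_CHECK behaves roughly like the following; this is an illustrative restatement, not code from the file:

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    // Roughly what the updated CUDA_CHECK macro does at its call site.
    static void check_cuda(cudaError_t err, const char * file, int line) {
        if (err != cudaSuccess) {
            int id = -1;
            cudaGetDevice(&id);   // which GPU the failing call ran on
            fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", (int) err, file, line, cudaGetErrorString(err));
            fprintf(stderr, "current device: %d\n", id);
            exit(1);
        }
    }
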
@@ -212,10 +231,13 @@ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__
  typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
- typedef void (*ggml_cuda_op_t)(
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
- float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
- cudaStream_t & cudaStream_main);
+ typedef void (*ggml_cuda_op_mul_mat_t)(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+ const int64_t src1_padded_row_size, const cudaStream_t & stream);
+ typedef void (*ggml_cuda_op_flatten_t)(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream);

  // QK = number of values after dequantization
  // QR = QK / number of values before dequantization
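
The single ggml_cuda_op_t callback type is split in two: ggml_cuda_op_mul_mat_t for operations that work on row slices of a (possibly split, multi-GPU) matrix multiplication, and ggml_cuda_op_flatten_t for element-wise operations that just see flat device pointers for src0, src1 and dst. The simplified ops later in this diff (add, mul, gelu, silu, norm, rms_norm) all follow the flatten signature. A hedged sketch of an op matching the new typedef; the ReLU kernel is purely illustrative and not part of the file:

    // Sketch of an op matching ggml_cuda_op_flatten_t (signature from the typedef above).
    static __global__ void relu_f32(const float * x, float * dst, const int k) {
        const int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i < k) {
            dst[i] = x[i] > 0.0f ? x[i] : 0.0f;
        }
    }

    inline void ggml_cuda_op_relu_example(
        const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
        const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

        const int k = (int) ggml_nelements(src0);
        relu_f32<<<(k + 255)/256, 256, 0, main_stream>>>(src0_dd, dst_dd, k);

        (void) src1; (void) src1_dd; (void) dst;  // unused for a unary op
    }
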
@@ -396,11 +418,29 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
  #endif

+ #define MUL_MAT_SRC1_COL_STRIDE 128
+
+ #define MAX_STREAMS 8
+ static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+
  struct ggml_tensor_extra_gpu {
  void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
- cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+ cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
  };

+ // this is faster on Windows
+ // probably because the Windows CUDA libraries forget to make this check before invoking the drivers
+ inline cudaError_t ggml_cuda_set_device(const int device) {
+ int current_device;
+ CUDA_CHECK(cudaGetDevice(&current_device));
+
+ if (device == current_device) {
+ return cudaSuccess;
+ }
+
+ return cudaSetDevice(device);
+ }
+
  static int g_device_count = -1;
  static int g_main_device = 0;
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
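
Two infrastructure changes land here: the single per-device stream (g_cudaStreams_main, removed in the next hunk) is replaced by MAX_STREAMS (8) streams per device with a matching two-dimensional event array for cross-device synchronization, and ggml_cuda_set_device() short-circuits when the requested device is already current, which the comment in the hunk attributes to a costly redundant driver call on Windows. A small usage sketch, assuming the names introduced above; the round-robin stream choice is illustrative, not the exact scheduling in the file:

    // Sketch: selecting one of the MAX_STREAMS per-device streams for an independent chunk of work.
    static cudaStream_t pick_stream(const int device, const int64_t chunk) {
        CUDA_CHECK(ggml_cuda_set_device(device));          // no-op if `device` is already current
        return g_cudaStreams[device][chunk % MAX_STREAMS]; // spread chunks across the 8 streams
    }
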
@@ -413,8 +453,6 @@ static size_t g_scratch_offset = 0;

  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

- static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -3444,6 +3482,12 @@ static __device__ __forceinline__ void mul_mat_q(
  }
  }

+ #define MMQ_X_Q4_0_RDNA2 64
+ #define MMQ_Y_Q4_0_RDNA2 128
+ #define NWARPS_Q4_0_RDNA2 8
+ #define MMQ_X_Q4_0_RDNA1 64
+ #define MMQ_Y_Q4_0_RDNA1 64
+ #define NWARPS_Q4_0_RDNA1 8
  #define MMQ_X_Q4_0_AMPERE 64
  #define MMQ_Y_Q4_0_AMPERE 128
  #define NWARPS_Q4_0_AMPERE 4
@@ -3451,11 +3495,32 @@ static __device__ __forceinline__ void mul_mat_q(
  #define MMQ_Y_Q4_0_PASCAL 64
  #define NWARPS_Q4_0_PASCAL 8

- template <bool need_check> static __global__ void mul_mat_q4_0(
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ mul_mat_q4_0(
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

- #if __CUDA_ARCH__ >= CC_TURING
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ const int mmq_x = MMQ_X_Q4_0_RDNA2;
+ const int mmq_y = MMQ_Y_Q4_0_RDNA2;
+ const int nwarps = NWARPS_Q4_0_RDNA2;
+ #else
+ const int mmq_x = MMQ_X_Q4_0_RDNA1;
+ const int mmq_y = MMQ_Y_Q4_0_RDNA1;
+ const int nwarps = NWARPS_Q4_0_RDNA1;
+ #endif // defined(RDNA3) || defined(RDNA2)
+
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= CC_TURING
  const int mmq_x = MMQ_X_Q4_0_AMPERE;
  const int mmq_y = MMQ_Y_Q4_0_AMPERE;
  const int nwarps = NWARPS_Q4_0_AMPERE;
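
The same specialization is repeated for every quantized matrix-multiplication kernel below (q4_1, q5_0, q5_1, q8_0 and the q2_K through q6_K variants): RDNA-specific MMQ tile constants are added, on AMD RDNA hardware the kernel gains a __launch_bounds__ annotation for two resident blocks, and at device-compile time the RDNA2/RDNA1 constants are chosen ahead of the existing Ampere/Turing and Pascal branches. The tile constants determine the launch geometry; a hedged host-side illustration of how they map to a grid (the rounding mirrors the launcher functions later in this diff, but treat the helper as a sketch):

    #include <cstdio>

    struct mmq_config { int mmq_x, mmq_y, nwarps; };

    // Each block covers an mmq_y x mmq_x output tile using WARP_SIZE*nwarps threads.
    static void print_grid(const char * name, const mmq_config c, const int nrows_x, const int ncols_y) {
        const int warp_size   = 32;                                // WARP_SIZE in the file
        const int block_num_x = (nrows_x + c.mmq_y - 1) / c.mmq_y; // tiles along the rows of x
        const int block_num_y = (ncols_y + c.mmq_x - 1) / c.mmq_x; // tiles along the columns of y
        printf("%s: %d x %d blocks, %d threads each\n", name, block_num_x, block_num_y, warp_size*c.nwarps);
    }

    int main() {
        print_grid("Q4_0 RDNA2 ", {64, 128, 8}, 4096, 512);        // constants from the defines above
        print_grid("Q4_0 RDNA1 ", {64,  64, 8}, 4096, 512);
        print_grid("Q4_0 Ampere", {64, 128, 4}, 4096, 512);
        return 0;
    }
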
@@ -3478,6 +3543,12 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
  #endif // __CUDA_ARCH__ >= CC_TURING
  }

+ #define MMQ_X_Q4_1_RDNA2 64
+ #define MMQ_Y_Q4_1_RDNA2 128
+ #define NWARPS_Q4_1_RDNA2 8
+ #define MMQ_X_Q4_1_RDNA1 64
+ #define MMQ_Y_Q4_1_RDNA1 64
+ #define NWARPS_Q4_1_RDNA1 8
  #define MMQ_X_Q4_1_AMPERE 64
  #define MMQ_Y_Q4_1_AMPERE 128
  #define NWARPS_Q4_1_AMPERE 4
@@ -3486,14 +3557,33 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
  #define NWARPS_Q4_1_PASCAL 8

  template <bool need_check> static __global__ void
- #if __CUDA_ARCH__ < CC_TURING
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #elif __CUDA_ARCH__ < CC_TURING
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
  #endif // __CUDA_ARCH__ < CC_TURING
  mul_mat_q4_1(
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

- #if __CUDA_ARCH__ >= CC_TURING
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ const int mmq_x = MMQ_X_Q4_1_RDNA2;
+ const int mmq_y = MMQ_Y_Q4_1_RDNA2;
+ const int nwarps = NWARPS_Q4_1_RDNA2;
+ #else
+ const int mmq_x = MMQ_X_Q4_1_RDNA1;
+ const int mmq_y = MMQ_Y_Q4_1_RDNA1;
+ const int nwarps = NWARPS_Q4_1_RDNA1;
+ #endif // defined(RDNA3) || defined(RDNA2)
+
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= CC_TURING
  const int mmq_x = MMQ_X_Q4_1_AMPERE;
  const int mmq_y = MMQ_Y_Q4_1_AMPERE;
  const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3516,6 +3606,12 @@ template <bool need_check> static __global__ void
  #endif // __CUDA_ARCH__ >= CC_TURING
  }

+ #define MMQ_X_Q5_0_RDNA2 64
+ #define MMQ_Y_Q5_0_RDNA2 128
+ #define NWARPS_Q5_0_RDNA2 8
+ #define MMQ_X_Q5_0_RDNA1 64
+ #define MMQ_Y_Q5_0_RDNA1 64
+ #define NWARPS_Q5_0_RDNA1 8
  #define MMQ_X_Q5_0_AMPERE 128
  #define MMQ_Y_Q5_0_AMPERE 64
  #define NWARPS_Q5_0_AMPERE 4
@@ -3523,11 +3619,32 @@ template <bool need_check> static __global__ void
  #define MMQ_Y_Q5_0_PASCAL 64
  #define NWARPS_Q5_0_PASCAL 8

- template <bool need_check> static __global__ void mul_mat_q5_0(
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ mul_mat_q5_0(
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

- #if __CUDA_ARCH__ >= CC_TURING
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ const int mmq_x = MMQ_X_Q5_0_RDNA2;
+ const int mmq_y = MMQ_Y_Q5_0_RDNA2;
+ const int nwarps = NWARPS_Q5_0_RDNA2;
+ #else
+ const int mmq_x = MMQ_X_Q5_0_RDNA1;
+ const int mmq_y = MMQ_Y_Q5_0_RDNA1;
+ const int nwarps = NWARPS_Q5_0_RDNA1;
+ #endif // defined(RDNA3) || defined(RDNA2)
+
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= CC_TURING
  const int mmq_x = MMQ_X_Q5_0_AMPERE;
  const int mmq_y = MMQ_Y_Q5_0_AMPERE;
  const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3550,6 +3667,12 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
  #endif // __CUDA_ARCH__ >= CC_TURING
  }

+ #define MMQ_X_Q5_1_RDNA2 64
+ #define MMQ_Y_Q5_1_RDNA2 128
+ #define NWARPS_Q5_1_RDNA2 8
+ #define MMQ_X_Q5_1_RDNA1 64
+ #define MMQ_Y_Q5_1_RDNA1 64
+ #define NWARPS_Q5_1_RDNA1 8
  #define MMQ_X_Q5_1_AMPERE 128
  #define MMQ_Y_Q5_1_AMPERE 64
  #define NWARPS_Q5_1_AMPERE 4
@@ -3557,11 +3680,32 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
  #define MMQ_Y_Q5_1_PASCAL 64
  #define NWARPS_Q5_1_PASCAL 8

- template <bool need_check> static __global__ void mul_mat_q5_1(
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ mul_mat_q5_1(
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

- #if __CUDA_ARCH__ >= CC_TURING
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ const int mmq_x = MMQ_X_Q5_1_RDNA2;
+ const int mmq_y = MMQ_Y_Q5_1_RDNA2;
+ const int nwarps = NWARPS_Q5_1_RDNA2;
+ #else
+ const int mmq_x = MMQ_X_Q5_1_RDNA1;
+ const int mmq_y = MMQ_Y_Q5_1_RDNA1;
+ const int nwarps = NWARPS_Q5_1_RDNA1;
+ #endif // defined(RDNA3) || defined(RDNA2)
+
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= CC_TURING
  const int mmq_x = MMQ_X_Q5_1_AMPERE;
  const int mmq_y = MMQ_Y_Q5_1_AMPERE;
  const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3584,6 +3728,12 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
  #endif // __CUDA_ARCH__ >= CC_TURING
  }

+ #define MMQ_X_Q8_0_RDNA2 64
+ #define MMQ_Y_Q8_0_RDNA2 128
+ #define NWARPS_Q8_0_RDNA2 8
+ #define MMQ_X_Q8_0_RDNA1 64
+ #define MMQ_Y_Q8_0_RDNA1 64
+ #define NWARPS_Q8_0_RDNA1 8
  #define MMQ_X_Q8_0_AMPERE 128
  #define MMQ_Y_Q8_0_AMPERE 64
  #define NWARPS_Q8_0_AMPERE 4
@@ -3591,11 +3741,32 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
  #define MMQ_Y_Q8_0_PASCAL 64
  #define NWARPS_Q8_0_PASCAL 8

- template <bool need_check> static __global__ void mul_mat_q8_0(
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ mul_mat_q8_0(
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

- #if __CUDA_ARCH__ >= CC_TURING
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ const int mmq_x = MMQ_X_Q8_0_RDNA2;
+ const int mmq_y = MMQ_Y_Q8_0_RDNA2;
+ const int nwarps = NWARPS_Q8_0_RDNA2;
+ #else
+ const int mmq_x = MMQ_X_Q8_0_RDNA1;
+ const int mmq_y = MMQ_Y_Q8_0_RDNA1;
+ const int nwarps = NWARPS_Q8_0_RDNA1;
+ #endif // defined(RDNA3) || defined(RDNA2)
+
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= CC_TURING
  const int mmq_x = MMQ_X_Q8_0_AMPERE;
  const int mmq_y = MMQ_Y_Q8_0_AMPERE;
  const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3618,6 +3789,12 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
  #endif // __CUDA_ARCH__ >= CC_TURING
  }

+ #define MMQ_X_Q2_K_RDNA2 64
+ #define MMQ_Y_Q2_K_RDNA2 128
+ #define NWARPS_Q2_K_RDNA2 8
+ #define MMQ_X_Q2_K_RDNA1 128
+ #define MMQ_Y_Q2_K_RDNA1 32
+ #define NWARPS_Q2_K_RDNA1 8
  #define MMQ_X_Q2_K_AMPERE 64
  #define MMQ_Y_Q2_K_AMPERE 128
  #define NWARPS_Q2_K_AMPERE 4
@@ -3625,11 +3802,32 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
  #define MMQ_Y_Q2_K_PASCAL 64
  #define NWARPS_Q2_K_PASCAL 8

- template <bool need_check> static __global__ void mul_mat_q2_K(
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ mul_mat_q2_K(
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

- #if __CUDA_ARCH__ >= CC_TURING
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ const int mmq_x = MMQ_X_Q2_K_RDNA2;
+ const int mmq_y = MMQ_Y_Q2_K_RDNA2;
+ const int nwarps = NWARPS_Q2_K_RDNA2;
+ #else
+ const int mmq_x = MMQ_X_Q2_K_RDNA1;
+ const int mmq_y = MMQ_Y_Q2_K_RDNA1;
+ const int nwarps = NWARPS_Q2_K_RDNA1;
+ #endif // defined(RDNA3) || defined(RDNA2)
+
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= CC_TURING
  const int mmq_x = MMQ_X_Q2_K_AMPERE;
  const int mmq_y = MMQ_Y_Q2_K_AMPERE;
  const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3652,6 +3850,12 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
  #endif // __CUDA_ARCH__ >= CC_TURING
  }

+ #define MMQ_X_Q3_K_RDNA2 128
+ #define MMQ_Y_Q3_K_RDNA2 64
+ #define NWARPS_Q3_K_RDNA2 8
+ #define MMQ_X_Q3_K_RDNA1 32
+ #define MMQ_Y_Q3_K_RDNA1 128
+ #define NWARPS_Q3_K_RDNA1 8
  #define MMQ_X_Q3_K_AMPERE 128
  #define MMQ_Y_Q3_K_AMPERE 128
  #define NWARPS_Q3_K_AMPERE 4
@@ -3660,14 +3864,33 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
  #define NWARPS_Q3_K_PASCAL 8

  template <bool need_check> static __global__ void
- #if __CUDA_ARCH__ < CC_TURING
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #elif __CUDA_ARCH__ < CC_TURING
  __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
  #endif // __CUDA_ARCH__ < CC_TURING
  mul_mat_q3_K(
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

- #if __CUDA_ARCH__ >= CC_TURING
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ const int mmq_x = MMQ_X_Q3_K_RDNA2;
+ const int mmq_y = MMQ_Y_Q3_K_RDNA2;
+ const int nwarps = NWARPS_Q3_K_RDNA2;
+ #else
+ const int mmq_x = MMQ_X_Q3_K_RDNA1;
+ const int mmq_y = MMQ_Y_Q3_K_RDNA1;
+ const int nwarps = NWARPS_Q3_K_RDNA1;
+ #endif // defined(RDNA3) || defined(RDNA2)
+
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= CC_TURING
  const int mmq_x = MMQ_X_Q3_K_AMPERE;
  const int mmq_y = MMQ_Y_Q3_K_AMPERE;
  const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3690,6 +3913,12 @@ template <bool need_check> static __global__ void
  #endif // __CUDA_ARCH__ >= CC_TURING
  }

+ #define MMQ_X_Q4_K_RDNA2 64
+ #define MMQ_Y_Q4_K_RDNA2 128
+ #define NWARPS_Q4_K_RDNA2 8
+ #define MMQ_X_Q4_K_RDNA1 32
+ #define MMQ_Y_Q4_K_RDNA1 64
+ #define NWARPS_Q4_K_RDNA1 8
  #define MMQ_X_Q4_K_AMPERE 64
  #define MMQ_Y_Q4_K_AMPERE 128
  #define NWARPS_Q4_K_AMPERE 4
@@ -3698,14 +3927,33 @@ template <bool need_check> static __global__ void
  #define NWARPS_Q4_K_PASCAL 8

  template <bool need_check> static __global__ void
- #if __CUDA_ARCH__ < CC_TURING
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #elif __CUDA_ARCH__ < CC_TURING
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
  #endif // __CUDA_ARCH__ < CC_TURING
  mul_mat_q4_K(
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

- #if __CUDA_ARCH__ >= CC_TURING
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ const int mmq_x = MMQ_X_Q4_K_RDNA2;
+ const int mmq_y = MMQ_Y_Q4_K_RDNA2;
+ const int nwarps = NWARPS_Q4_K_RDNA2;
+ #else
+ const int mmq_x = MMQ_X_Q4_K_RDNA1;
+ const int mmq_y = MMQ_Y_Q4_K_RDNA1;
+ const int nwarps = NWARPS_Q4_K_RDNA1;
+ #endif // defined(RDNA3) || defined(RDNA2)
+
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= CC_TURING
  const int mmq_x = MMQ_X_Q4_K_AMPERE;
  const int mmq_y = MMQ_Y_Q4_K_AMPERE;
  const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -3728,6 +3976,12 @@ template <bool need_check> static __global__ void
  #endif // __CUDA_ARCH__ >= CC_TURING
  }

+ #define MMQ_X_Q5_K_RDNA2 64
+ #define MMQ_Y_Q5_K_RDNA2 128
+ #define NWARPS_Q5_K_RDNA2 8
+ #define MMQ_X_Q5_K_RDNA1 32
+ #define MMQ_Y_Q5_K_RDNA1 64
+ #define NWARPS_Q5_K_RDNA1 8
  #define MMQ_X_Q5_K_AMPERE 64
  #define MMQ_Y_Q5_K_AMPERE 128
  #define NWARPS_Q5_K_AMPERE 4
@@ -3735,11 +3989,32 @@ template <bool need_check> static __global__ void
  #define MMQ_Y_Q5_K_PASCAL 64
  #define NWARPS_Q5_K_PASCAL 8

- template <bool need_check> static __global__ void mul_mat_q5_K(
+ template <bool need_check> static __global__ void
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ mul_mat_q5_K(
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

- #if __CUDA_ARCH__ >= CC_TURING
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ const int mmq_x = MMQ_X_Q5_K_RDNA2;
+ const int mmq_y = MMQ_Y_Q5_K_RDNA2;
+ const int nwarps = NWARPS_Q5_K_RDNA2;
+ #else
+ const int mmq_x = MMQ_X_Q5_K_RDNA1;
+ const int mmq_y = MMQ_Y_Q5_K_RDNA1;
+ const int nwarps = NWARPS_Q5_K_RDNA1;
+ #endif // defined(RDNA3) || defined(RDNA2)
+
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= CC_TURING
  const int mmq_x = MMQ_X_Q5_K_AMPERE;
  const int mmq_y = MMQ_Y_Q5_K_AMPERE;
  const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -3762,6 +4037,12 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
  #endif // __CUDA_ARCH__ >= CC_TURING
  }

+ #define MMQ_X_Q6_K_RDNA2 64
+ #define MMQ_Y_Q6_K_RDNA2 128
+ #define NWARPS_Q6_K_RDNA2 8
+ #define MMQ_X_Q6_K_RDNA1 32
+ #define MMQ_Y_Q6_K_RDNA1 64
+ #define NWARPS_Q6_K_RDNA1 8
  #define MMQ_X_Q6_K_AMPERE 64
  #define MMQ_Y_Q6_K_AMPERE 64
  #define NWARPS_Q6_K_AMPERE 4
@@ -3770,14 +4051,33 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
  #define NWARPS_Q6_K_PASCAL 8

  template <bool need_check> static __global__ void
- #if __CUDA_ARCH__ < CC_TURING
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
+ #endif // defined(RDNA3) || defined(RDNA2)
+ #elif __CUDA_ARCH__ < CC_TURING
  __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
  #endif // __CUDA_ARCH__ < CC_TURING
  mul_mat_q6_K(
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {

- #if __CUDA_ARCH__ >= CC_TURING
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ #if defined(RDNA3) || defined(RDNA2)
+ const int mmq_x = MMQ_X_Q6_K_RDNA2;
+ const int mmq_y = MMQ_Y_Q6_K_RDNA2;
+ const int nwarps = NWARPS_Q6_K_RDNA2;
+ #else
+ const int mmq_x = MMQ_X_Q6_K_RDNA1;
+ const int mmq_y = MMQ_Y_Q6_K_RDNA1;
+ const int nwarps = NWARPS_Q6_K_RDNA1;
+ #endif // defined(RDNA3) || defined(RDNA2)
+
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+ #elif __CUDA_ARCH__ >= CC_TURING
  const int mmq_x = MMQ_X_Q6_K_AMPERE;
  const int mmq_y = MMQ_Y_Q6_K_AMPERE;
  const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4086,7 +4386,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
  dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
  }

- static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
+ const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
  const int half_n_dims = ncols/4;

@@ -4098,8 +4399,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
  const int i = row*ncols + col;

  const float col_theta_scale = powf(theta_scale, col);
+ const float p = p0 + p_delta*(row/p_delta_rows);

- const float theta = p*col_theta_scale;
+ const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
  const float sin_theta = sinf(theta);
  const float cos_theta = cosf(theta);

@@ -4109,7 +4411,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
  dst[i + 0] = x0*cos_theta - x1*sin_theta;
  dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

- const float block_theta = block_p*col_theta_scale;
+ const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
  const float sin_block_theta = sinf(block_theta);
  const float cos_block_theta = cosf(block_theta);

@@ -4558,7 +4860,15 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
  const int compute_capability = g_compute_capabilities[id];

  int mmq_x, mmq_y, nwarps;
- if (compute_capability >= CC_TURING) {
+ if (compute_capability >= CC_RDNA2) {
+ mmq_x = MMQ_X_Q4_0_RDNA2;
+ mmq_y = MMQ_Y_Q4_0_RDNA2;
+ nwarps = NWARPS_Q4_0_RDNA2;
+ } else if (compute_capability >= CC_OFFSET_AMD) {
+ mmq_x = MMQ_X_Q4_0_RDNA1;
+ mmq_y = MMQ_Y_Q4_0_RDNA1;
+ nwarps = NWARPS_Q4_0_RDNA1;
+ } else if (compute_capability >= CC_TURING) {
  mmq_x = MMQ_X_Q4_0_AMPERE;
  mmq_y = MMQ_Y_Q4_0_AMPERE;
  nwarps = NWARPS_Q4_0_AMPERE;
@@ -4595,7 +4905,15 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
  const int compute_capability = g_compute_capabilities[id];

  int mmq_x, mmq_y, nwarps;
- if (compute_capability >= CC_TURING) {
+ if (compute_capability >= CC_RDNA2) {
+ mmq_x = MMQ_X_Q4_1_RDNA2;
+ mmq_y = MMQ_Y_Q4_1_RDNA2;
+ nwarps = NWARPS_Q4_1_RDNA2;
+ } else if (compute_capability >= CC_OFFSET_AMD) {
+ mmq_x = MMQ_X_Q4_1_RDNA1;
+ mmq_y = MMQ_Y_Q4_1_RDNA1;
+ nwarps = NWARPS_Q4_1_RDNA1;
+ } else if (compute_capability >= CC_TURING) {
  mmq_x = MMQ_X_Q4_1_AMPERE;
  mmq_y = MMQ_Y_Q4_1_AMPERE;
  nwarps = NWARPS_Q4_1_AMPERE;
@@ -4632,7 +4950,15 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
  const int compute_capability = g_compute_capabilities[id];

  int mmq_x, mmq_y, nwarps;
- if (compute_capability >= CC_TURING) {
+ if (compute_capability >= CC_RDNA2) {
+ mmq_x = MMQ_X_Q5_0_RDNA2;
+ mmq_y = MMQ_Y_Q5_0_RDNA2;
+ nwarps = NWARPS_Q5_0_RDNA2;
+ } else if (compute_capability >= CC_OFFSET_AMD) {
+ mmq_x = MMQ_X_Q5_0_RDNA1;
+ mmq_y = MMQ_Y_Q5_0_RDNA1;
+ nwarps = NWARPS_Q5_0_RDNA1;
+ } else if (compute_capability >= CC_TURING) {
  mmq_x = MMQ_X_Q5_0_AMPERE;
  mmq_y = MMQ_Y_Q5_0_AMPERE;
  nwarps = NWARPS_Q5_0_AMPERE;
@@ -4669,7 +4995,15 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
  const int compute_capability = g_compute_capabilities[id];

  int mmq_x, mmq_y, nwarps;
- if (compute_capability >= CC_TURING) {
+ if (compute_capability >= CC_RDNA2) {
+ mmq_x = MMQ_X_Q5_1_RDNA2;
+ mmq_y = MMQ_Y_Q5_1_RDNA2;
+ nwarps = NWARPS_Q5_1_RDNA2;
+ } else if (compute_capability >= CC_OFFSET_AMD) {
+ mmq_x = MMQ_X_Q5_1_RDNA1;
+ mmq_y = MMQ_Y_Q5_1_RDNA1;
+ nwarps = NWARPS_Q5_1_RDNA1;
+ } else if (compute_capability >= CC_TURING) {
  mmq_x = MMQ_X_Q5_1_AMPERE;
  mmq_y = MMQ_Y_Q5_1_AMPERE;
  nwarps = NWARPS_Q5_1_AMPERE;
@@ -4706,7 +5040,15 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
  const int compute_capability = g_compute_capabilities[id];

  int mmq_x, mmq_y, nwarps;
- if (compute_capability >= CC_TURING) {
+ if (compute_capability >= CC_RDNA2) {
+ mmq_x = MMQ_X_Q8_0_RDNA2;
+ mmq_y = MMQ_Y_Q8_0_RDNA2;
+ nwarps = NWARPS_Q8_0_RDNA2;
+ } else if (compute_capability >= CC_OFFSET_AMD) {
+ mmq_x = MMQ_X_Q8_0_RDNA1;
+ mmq_y = MMQ_Y_Q8_0_RDNA1;
+ nwarps = NWARPS_Q8_0_RDNA1;
+ } else if (compute_capability >= CC_TURING) {
  mmq_x = MMQ_X_Q8_0_AMPERE;
  mmq_y = MMQ_Y_Q8_0_AMPERE;
  nwarps = NWARPS_Q8_0_AMPERE;
@@ -4743,7 +5085,15 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
  const int compute_capability = g_compute_capabilities[id];

  int mmq_x, mmq_y, nwarps;
- if (compute_capability >= CC_TURING) {
+ if (compute_capability >= CC_RDNA2) {
+ mmq_x = MMQ_X_Q2_K_RDNA2;
+ mmq_y = MMQ_Y_Q2_K_RDNA2;
+ nwarps = NWARPS_Q2_K_RDNA2;
+ } else if (compute_capability >= CC_OFFSET_AMD) {
+ mmq_x = MMQ_X_Q2_K_RDNA1;
+ mmq_y = MMQ_Y_Q2_K_RDNA1;
+ nwarps = NWARPS_Q2_K_RDNA1;
+ } else if (compute_capability >= CC_TURING) {
  mmq_x = MMQ_X_Q2_K_AMPERE;
  mmq_y = MMQ_Y_Q2_K_AMPERE;
  nwarps = NWARPS_Q2_K_AMPERE;
@@ -4782,7 +5132,15 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
  const int compute_capability = g_compute_capabilities[id];

  int mmq_x, mmq_y, nwarps;
- if (compute_capability >= CC_TURING) {
+ if (compute_capability >= CC_RDNA2) {
+ mmq_x = MMQ_X_Q3_K_RDNA2;
+ mmq_y = MMQ_Y_Q3_K_RDNA2;
+ nwarps = NWARPS_Q3_K_RDNA2;
+ } else if (compute_capability >= CC_OFFSET_AMD) {
+ mmq_x = MMQ_X_Q3_K_RDNA1;
+ mmq_y = MMQ_Y_Q3_K_RDNA1;
+ nwarps = NWARPS_Q3_K_RDNA1;
+ } else if (compute_capability >= CC_TURING) {
  mmq_x = MMQ_X_Q3_K_AMPERE;
  mmq_y = MMQ_Y_Q3_K_AMPERE;
  nwarps = NWARPS_Q3_K_AMPERE;
@@ -4820,7 +5178,15 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
  const int compute_capability = g_compute_capabilities[id];

  int mmq_x, mmq_y, nwarps;
- if (compute_capability >= CC_TURING) {
+ if (compute_capability >= CC_RDNA2) {
+ mmq_x = MMQ_X_Q4_K_RDNA2;
+ mmq_y = MMQ_Y_Q4_K_RDNA2;
+ nwarps = NWARPS_Q4_K_RDNA2;
+ } else if (compute_capability >= CC_OFFSET_AMD) {
+ mmq_x = MMQ_X_Q4_K_RDNA1;
+ mmq_y = MMQ_Y_Q4_K_RDNA1;
+ nwarps = NWARPS_Q4_K_RDNA1;
+ } else if (compute_capability >= CC_TURING) {
  mmq_x = MMQ_X_Q4_K_AMPERE;
  mmq_y = MMQ_Y_Q4_K_AMPERE;
  nwarps = NWARPS_Q4_K_AMPERE;
@@ -4857,7 +5223,15 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
  const int compute_capability = g_compute_capabilities[id];

  int mmq_x, mmq_y, nwarps;
- if (compute_capability >= CC_TURING) {
+ if (compute_capability >= CC_RDNA2) {
+ mmq_x = MMQ_X_Q5_K_RDNA2;
+ mmq_y = MMQ_Y_Q5_K_RDNA2;
+ nwarps = NWARPS_Q5_K_RDNA2;
+ } else if (compute_capability >= CC_OFFSET_AMD) {
+ mmq_x = MMQ_X_Q5_K_RDNA1;
+ mmq_y = MMQ_Y_Q5_K_RDNA1;
+ nwarps = NWARPS_Q5_K_RDNA1;
+ } else if (compute_capability >= CC_TURING) {
  mmq_x = MMQ_X_Q5_K_AMPERE;
  mmq_y = MMQ_Y_Q5_K_AMPERE;
  nwarps = NWARPS_Q5_K_AMPERE;
@@ -4894,7 +5268,15 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
  const int compute_capability = g_compute_capabilities[id];

  int mmq_x, mmq_y, nwarps;
- if (compute_capability >= CC_TURING) {
+ if (compute_capability >= CC_RDNA2) {
+ mmq_x = MMQ_X_Q6_K_RDNA2;
+ mmq_y = MMQ_Y_Q6_K_RDNA2;
+ nwarps = NWARPS_Q6_K_RDNA2;
+ } else if (compute_capability >= CC_OFFSET_AMD) {
+ mmq_x = MMQ_X_Q6_K_RDNA1;
+ mmq_y = MMQ_Y_Q6_K_RDNA1;
+ nwarps = NWARPS_Q6_K_RDNA1;
+ } else if (compute_capability >= CC_TURING) {
  mmq_x = MMQ_X_Q6_K_AMPERE;
  mmq_y = MMQ_Y_Q6_K_AMPERE;
  nwarps = NWARPS_Q6_K_AMPERE;
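
Every ggml_mul_mat_*_q8_1_cuda launcher above now selects its tile constants with the same three-way ladder: RDNA2-or-newer AMD first, any other AMD device next (anything at or above CC_OFFSET_AMD), then Turing-class NVIDIA, with the Pascal constants as the final fallback. Restated generically under the constants defined earlier in this diff (the helper and struct names are illustrative):

    #define CC_TURING     700
    #define CC_OFFSET_AMD 1000000
    #define CC_RDNA2      (CC_OFFSET_AMD + 1030)

    struct mmq_config { int mmq_x, mmq_y, nwarps; };

    // Sketch of the shared selection logic; each launcher plugs in its own four constant sets.
    static mmq_config choose_tile(const int cc, const mmq_config rdna2, const mmq_config rdna1,
                                  const mmq_config ampere, const mmq_config pascal) {
        if (cc >= CC_RDNA2)      return rdna2;   // RDNA2/RDNA3 AMD GPUs
        if (cc >= CC_OFFSET_AMD) return rdna1;   // any other AMD GPU
        if (cc >= CC_TURING)     return ampere;  // Turing-class and newer NVIDIA GPUs
        return pascal;                           // older NVIDIA GPUs
    }
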
@@ -4984,12 +5366,13 @@ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, co
  rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
  }

- static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
- GGML_ASSERT(nrows % 4 == 0);
- const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
- const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
+ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+ const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
+ GGML_ASSERT(ncols % 4 == 0);
+ const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
+ const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
  const dim3 block_nums(num_blocks_x, nrows, 1);
- rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
  }

  static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
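
On the GLM RoPE changes above: rope_glm_f32 no longer receives precomputed p and block_p scalars from the host. Each row now derives its own position from p0, p_delta and p_delta_rows, and the two rotation angles are split around the n_ctx boundary. Transcribing the updated kernel into formulas, with s = theta_scale^col and integer division for row / p_delta_rows:

    p           = p0 + p_delta * (row / p_delta_rows)
    theta       = min(p, p_delta * (n_ctx - 2)) * s
    block_theta = max(p - p_delta * (n_ctx - 2), 0) * s

In other words, the per-token position is clamped at p_delta * (n_ctx - 2) for the first rotary half, and any excess beyond that point feeds the second (block) rotation instead.
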
@@ -5127,25 +5510,30 @@ void ggml_init_cublas() {
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
  int64_t total_vram = 0;
  fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
- for (int id = 0; id < g_device_count; ++id) {
+ for (int64_t id = 0; id < g_device_count; ++id) {
  cudaDeviceProp prop;
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
- fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+ fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);

  g_tensor_split[id] = total_vram;
  total_vram += prop.totalGlobalMem;
-
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
+ #else
  g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
  }
- for (int id = 0; id < g_device_count; ++id) {
+ for (int64_t id = 0; id < g_device_count; ++id) {
  g_tensor_split[id] /= total_vram;
  }

- for (int id = 0; id < g_device_count; ++id) {
- CUDA_CHECK(cudaSetDevice(id));
+ for (int64_t id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(ggml_cuda_set_device(id));

- // create main stream
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
+ // create cuda streams
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
+ }

  // create cublas handle
  CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
@@ -5214,7 +5602,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
  if (src->backend == GGML_BACKEND_CPU) {
  kind = cudaMemcpyHostToDevice;
  src_ptr = (char *) src->data;
- } else if (src->backend == GGML_BACKEND_GPU) {
+ } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
+ GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
  kind = cudaMemcpyDeviceToDevice;
  struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
  int id;
@@ -5253,236 +5642,205 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
  }

  inline void ggml_cuda_op_add(
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
- cudaStream_t & cudaStream_main){
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

- GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
- GGML_ASSERT(src1_ddf_i != nullptr);
- GGML_ASSERT(dst_ddf_i != nullptr);
-
- const int64_t ne00 = src0->ne[0];
- const int64_t i01_diff = i01_high - i01_low;
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];

- // compute
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
+ add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
- add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
+ add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
  } else {
  GGML_ASSERT(false);
  }

  (void) src1;
  (void) dst;
- (void) src0_ddq_i;
- (void) i02;
- (void) i1;
  }

  inline void ggml_cuda_op_mul(
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
- cudaStream_t & cudaStream_main){
-
- GGML_ASSERT(src0_ddf_i != nullptr);
- GGML_ASSERT(src1_ddf_i != nullptr);
- GGML_ASSERT(dst_ddf_i != nullptr);
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

- const int64_t ne00 = src0->ne[0];
- const int64_t i01_diff = i01_high - i01_low;
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];

- mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
+ mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);

  (void) dst;
- (void) src0_ddq_i;
- (void) i02;
- (void) i1;
  }

  inline void ggml_cuda_op_gelu(
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
- cudaStream_t & cudaStream_main){
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

- GGML_ASSERT(src0_ddf_i != nullptr);
- GGML_ASSERT(dst_ddf_i != nullptr);
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);

- const int64_t ne00 = src0->ne[0];
- const int64_t i01_diff = i01_high - i01_low;
-
- // compute
- gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+ gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

  (void) src1;
  (void) dst;
- (void) src0_ddq_i;
- (void) src1_ddf_i;
- (void) i02;
- (void) i1;
+ (void) src1_dd;
  }

  inline void ggml_cuda_op_silu(
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
- cudaStream_t & cudaStream_main){
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

- GGML_ASSERT(src0_ddf_i != nullptr);
- GGML_ASSERT(dst_ddf_i != nullptr);
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);

- const int64_t ne00 = src0->ne[0];
- const int64_t i01_diff = i01_high - i01_low;
-
- // compute
- silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+ silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

  (void) src1;
  (void) dst;
- (void) src0_ddq_i;
- (void) src1_ddf_i;
- (void) i02;
- (void) i1;
+ (void) src1_dd;
  }

  inline void ggml_cuda_op_norm(
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
- cudaStream_t & cudaStream_main){
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

- GGML_ASSERT(src0_ddf_i != nullptr);
- GGML_ASSERT(dst_ddf_i != nullptr);
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);

  const int64_t ne00 = src0->ne[0];
- const int64_t i01_diff = i01_high - i01_low;
+ const int64_t nrows = ggml_nrows(src0);

- // compute
- norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+ norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);

  (void) src1;
  (void) dst;
- (void) src0_ddq_i;
- (void) src1_ddf_i;
- (void) i02;
- (void) i1;
+ (void) src1_dd;
  }

  inline void ggml_cuda_op_rms_norm(
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
- cudaStream_t & cudaStream_main){
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

- GGML_ASSERT(src0_ddf_i != nullptr);
- GGML_ASSERT(dst_ddf_i != nullptr);
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);

  const int64_t ne00 = src0->ne[0];
- const int64_t i01_diff = i01_high - i01_low;
+ const int64_t nrows = ggml_nrows(src0);

  float eps;
  memcpy(&eps, dst->op_params, sizeof(float));

- // compute
- rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
+ rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);

  (void) src1;
  (void) dst;
- (void) src0_ddq_i;
- (void) src1_ddf_i;
- (void) i02;
- (void) i1;
+ (void) src1_dd;
  }

  inline void ggml_cuda_op_mul_mat_q(
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
- cudaStream_t & cudaStream_main){
-
- GGML_ASSERT(src0_ddq_i != nullptr);
- GGML_ASSERT(src1_ddf_i != nullptr);
- GGML_ASSERT(dst_ddf_i != nullptr);
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {

  const int64_t ne00 = src0->ne[0];

  const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
  GGML_ASSERT(ne10 % QK8_1 == 0);

  const int64_t ne0 = dst->ne[0];

- const int64_t i01_diff = i01_high - i01_low;
+ const int64_t row_diff = row_high - row_low;

  int id;
  CUDA_CHECK(cudaGetDevice(&id));

  // the main device has a larger memory buffer to hold the results from all GPUs
  // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
- const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
-
- const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
- ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
- size_t as;
- void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;

  switch (src0->type) {
  case GGML_TYPE_Q4_0:
- ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+ ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
  break;
  case GGML_TYPE_Q4_1:
- ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+ ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
  break;
  case GGML_TYPE_Q5_0:
- ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+ ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
  break;
  case GGML_TYPE_Q5_1:
- ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+ ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
  break;
  case GGML_TYPE_Q8_0:
- ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+ ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
  break;
  case GGML_TYPE_Q2_K:
- ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+ ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
  break;
  case GGML_TYPE_Q3_K:
- ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+ ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
  break;
  case GGML_TYPE_Q4_K:
- ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+ ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
  break;
  case GGML_TYPE_Q5_K:
- ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+ ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
  break;
  case GGML_TYPE_Q6_K:
- ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
+ ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
  break;
  default:
  GGML_ASSERT(false);
  break;
  }

- ggml_cuda_pool_free(src1_q8_1, as);
-
  (void) src1;
  (void) dst;
- (void) src0_ddf_i;
- (void) i02;
- (void) i1;
+ (void) src1_ddf_i;
  }

  static int64_t get_row_rounding(ggml_type type) {
- int max_compute_capability = INT_MIN;
- for (int id = 0; id < g_device_count; ++id) {
- if (max_compute_capability < g_compute_capabilities[id]
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
- max_compute_capability = g_compute_capabilities[id];
+ int64_t min_compute_capability = INT_MAX;
+ int64_t max_compute_capability = INT_MIN;
+ for (int64_t id = 0; id < g_device_count; ++id) {
+ if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+ if (min_compute_capability > g_compute_capabilities[id]) {
+ min_compute_capability = g_compute_capabilities[id];
+ }
+ if (max_compute_capability < g_compute_capabilities[id]) {
+ max_compute_capability = g_compute_capabilities[id];
+ }
  }
  }

+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ switch(type) {
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
+ return max_compute_capability >= CC_RDNA2 ? 128 : 64;
+ case GGML_TYPE_F16:
+ return 1;
+ case GGML_TYPE_Q2_K:
+ return max_compute_capability >= CC_RDNA2 ? 128 : 32;
+ case GGML_TYPE_Q3_K:
+ return min_compute_capability < CC_RDNA2 ? 128 : 64;
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
+ return max_compute_capability >= CC_RDNA2 ? 128 : 64;
+ default:
+ GGML_ASSERT(false);
+ }
+ #else
  switch(type) {
  case GGML_TYPE_Q4_0:
  case GGML_TYPE_Q4_1:
@@ -5503,170 +5861,147 @@ static int64_t get_row_rounding(ggml_type type) {
5503
5861
  default:
5504
5862
  GGML_ASSERT(false);
5505
5863
  }
5864
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
5506
5865
  }
5507
5866
 
5508
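
The rounding returned by get_row_rounding() is what the rewritten ggml_cuda_op_mul_mat() further below uses to align each device's row slice to mul_mat_q tile boundaries. A minimal host-side sketch of that boundary computation, assuming a cumulative split table like g_tensor_split (the helper name split_rows is illustrative, not part of the library):

    #include <cstdint>
    #include <vector>

    // Derive per-device [row_low, row_high) ranges from cumulative split
    // fractions, rounding every interior boundary down to a multiple of
    // `rounding` so each slice starts on a tile edge.
    static void split_rows(const std::vector<float> & tensor_split, int64_t nrows,
                           int64_t rounding, std::vector<int64_t> & row_low,
                           std::vector<int64_t> & row_high) {
        const size_t n = tensor_split.size();
        row_low.assign(n, 0);
        row_high.assign(n, nrows);
        for (size_t id = 0; id < n; ++id) {
            if (id != 0) {
                row_low[id]  = (int64_t)(nrows*tensor_split[id]);
                row_low[id] -= row_low[id] % rounding;   // align slice start
            }
            if (id != n - 1) {
                row_high[id]  = (int64_t)(nrows*tensor_split[id + 1]);
                row_high[id] -= row_high[id] % rounding; // align slice end
            }
        }
    }
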
- inline void ggml_cuda_op_mul_mat_vec(
5509
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5510
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5511
- cudaStream_t & cudaStream_main){
5512
-
5513
- GGML_ASSERT(src0_ddq_i != nullptr);
5514
- GGML_ASSERT(src1_ddf_i != nullptr);
5515
- GGML_ASSERT(dst_ddf_i != nullptr);
5867
+ inline void ggml_cuda_op_mul_mat_vec_q(
5868
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5869
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
5870
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5516
5871
 
5517
5872
  const int64_t ne00 = src0->ne[0];
5518
- const int64_t nrows = i01_high - i01_low;
5873
+ const int64_t row_diff = row_high - row_low;
5519
5874
 
5520
- #ifdef GGML_CUDA_FORCE_DMMV
5521
- const bool use_mul_mat_vec_q = false;
5522
- (void) g_compute_capabilities[0];
5523
- #else
5524
- int id;
5525
- CUDA_CHECK(cudaGetDevice(&id));
5875
+ switch (src0->type) {
5876
+ case GGML_TYPE_Q4_0:
5877
+ mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5878
+ break;
5879
+ case GGML_TYPE_Q4_1:
5880
+ mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5881
+ break;
5882
+ case GGML_TYPE_Q5_0:
5883
+ mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5884
+ break;
5885
+ case GGML_TYPE_Q5_1:
5886
+ mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5887
+ break;
5888
+ case GGML_TYPE_Q8_0:
5889
+ mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5890
+ break;
5891
+ case GGML_TYPE_Q2_K:
5892
+ mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5893
+ break;
5894
+ case GGML_TYPE_Q3_K:
5895
+ mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5896
+ break;
5897
+ case GGML_TYPE_Q4_K:
5898
+ mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5899
+ break;
5900
+ case GGML_TYPE_Q5_K:
5901
+ mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5902
+ break;
5903
+ case GGML_TYPE_Q6_K:
5904
+ mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5905
+ break;
5906
+ default:
5907
+ GGML_ASSERT(false);
5908
+ break;
5909
+ }
5526
5910
 
5527
- bool mul_mat_vec_q_implemented =
5528
- src0->type == GGML_TYPE_Q4_0 ||
5529
- src0->type == GGML_TYPE_Q4_1 ||
5530
- src0->type == GGML_TYPE_Q5_0 ||
5531
- src0->type == GGML_TYPE_Q5_1 ||
5532
- src0->type == GGML_TYPE_Q8_0;
5533
- #if QK_K == 256
5534
- mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
5535
- src0->type == GGML_TYPE_Q2_K ||
5536
- src0->type == GGML_TYPE_Q3_K ||
5537
- src0->type == GGML_TYPE_Q4_K ||
5538
- src0->type == GGML_TYPE_Q5_K ||
5539
- src0->type == GGML_TYPE_Q6_K;
5540
- #endif // QK_K == 256
5541
-
5542
- const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
5543
- #endif
5911
+ (void) src1;
5912
+ (void) dst;
5913
+ (void) src1_ddf_i;
5914
+ (void) src1_ncols;
5915
+ (void) src1_padded_row_size;
5916
+ }
5544
5917
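
The quantized path relies on src1 rows being padded to a multiple of MATRIX_ROW_PADDING before quantization to q8_1; the old code computed padded_row_size locally, the new code receives src1_padded_row_size from the caller. A small sketch of that rounding, with the padding value as an illustrative parameter rather than the file's actual constant:

    #include <cassert>
    #include <cstdint>

    // Round a row length up to the next multiple of `padding`; identical
    // arithmetic to the src1_padded_row_size / src1_padded_col_size
    // expressions used in this file.
    static int64_t pad_row(int64_t ne, int64_t padding) {
        return ne % padding == 0 ? ne : ne - ne % padding + padding;
    }

    int main() {
        assert(pad_row(4096, 512) == 4096); // already aligned
        assert(pad_row(4097, 512) == 4608); // rounded up to the next multiple
        return 0;
    }
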
 
5545
- if (use_mul_mat_vec_q) {
5546
- const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
5547
- ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
5548
- size_t as;
5549
- void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
5550
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
5551
-
5552
- switch (src0->type) {
5553
- case GGML_TYPE_Q4_0:
5554
- mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5555
- break;
5556
- case GGML_TYPE_Q4_1:
5557
- mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5558
- break;
5559
- case GGML_TYPE_Q5_0:
5560
- mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5561
- break;
5562
- case GGML_TYPE_Q5_1:
5563
- mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5564
- break;
5565
- case GGML_TYPE_Q8_0:
5566
- mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5567
- break;
5568
- case GGML_TYPE_Q2_K:
5569
- mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5570
- break;
5571
- case GGML_TYPE_Q3_K:
5572
- mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5573
- break;
5574
- case GGML_TYPE_Q4_K:
5575
- mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5576
- break;
5577
- case GGML_TYPE_Q5_K:
5578
- mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5579
- break;
5580
- case GGML_TYPE_Q6_K:
5581
- mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5582
- break;
5583
- default:
5584
- GGML_ASSERT(false);
5585
- break;
5586
- }
5918
+ inline void ggml_cuda_op_dequantize_mul_mat_vec(
5919
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5920
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
5921
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5587
5922
 
5588
- ggml_cuda_pool_free(src1_q8_1, as);
5589
- } else {
5590
- // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
5923
+ const int64_t ne00 = src0->ne[0];
5924
+ const int64_t row_diff = row_high - row_low;
5925
+
5926
+ // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
5591
5927
  #ifdef GGML_CUDA_F16
5592
- size_t ash;
5593
- dfloat * src1_dfloat = nullptr; // dfloat == half
5594
-
5595
- bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
5596
- src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
5597
- src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
5598
-
5599
- if (src1_convert_f16) {
5600
- src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
5601
- ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
5602
- ne00, 1, sizeof(float), 0, 0,
5603
- ne00, 1, sizeof(half), 0, 0, cudaStream_main);
5604
- }
5928
+ size_t ash;
5929
+ dfloat * src1_dfloat = nullptr; // dfloat == half
5930
+
5931
+ bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
5932
+ src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
5933
+ src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
5934
+
5935
+ if (src1_convert_f16) {
5936
+ src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
5937
+ ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
5938
+ ne00, 1, sizeof(float), 0, 0,
5939
+ ne00, 1, sizeof(half), 0, 0, stream);
5940
+ }
5605
5941
  #else
5606
- dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
5942
+ const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
5607
5943
  #endif // GGML_CUDA_F16
5608
5944
 
5609
- switch (src0->type) {
5610
- case GGML_TYPE_Q4_0:
5611
- dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5612
- break;
5613
- case GGML_TYPE_Q4_1:
5614
- dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5615
- break;
5616
- case GGML_TYPE_Q5_0:
5617
- dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5618
- break;
5619
- case GGML_TYPE_Q5_1:
5620
- dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5621
- break;
5622
- case GGML_TYPE_Q8_0:
5623
- dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5624
- break;
5625
- case GGML_TYPE_Q2_K:
5626
- dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5627
- break;
5628
- case GGML_TYPE_Q3_K:
5629
- dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5630
- break;
5631
- case GGML_TYPE_Q4_K:
5632
- dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5633
- break;
5634
- case GGML_TYPE_Q5_K:
5635
- dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5636
- break;
5637
- case GGML_TYPE_Q6_K:
5638
- dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5639
- break;
5640
- case GGML_TYPE_F16:
5641
- convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5642
- break;
5643
- default:
5644
- GGML_ASSERT(false);
5645
- break;
5646
- }
5945
+ switch (src0->type) {
5946
+ case GGML_TYPE_Q4_0:
5947
+ dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5948
+ break;
5949
+ case GGML_TYPE_Q4_1:
5950
+ dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5951
+ break;
5952
+ case GGML_TYPE_Q5_0:
5953
+ dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5954
+ break;
5955
+ case GGML_TYPE_Q5_1:
5956
+ dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5957
+ break;
5958
+ case GGML_TYPE_Q8_0:
5959
+ dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5960
+ break;
5961
+ case GGML_TYPE_Q2_K:
5962
+ dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5963
+ break;
5964
+ case GGML_TYPE_Q3_K:
5965
+ dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5966
+ break;
5967
+ case GGML_TYPE_Q4_K:
5968
+ dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5969
+ break;
5970
+ case GGML_TYPE_Q5_K:
5971
+ dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5972
+ break;
5973
+ case GGML_TYPE_Q6_K:
5974
+ dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5975
+ break;
5976
+ case GGML_TYPE_F16:
5977
+ convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5978
+ break;
5979
+ default:
5980
+ GGML_ASSERT(false);
5981
+ break;
5982
+ }
5647
5983
 
5648
5984
  #ifdef GGML_CUDA_F16
5649
- if (src1_convert_f16) {
5650
- ggml_cuda_pool_free(src1_dfloat, ash);
5651
- }
5652
- #endif // GGML_CUDA_F16
5985
+ if (src1_convert_f16) {
5986
+ ggml_cuda_pool_free(src1_dfloat, ash);
5653
5987
  }
5988
+ #endif // GGML_CUDA_F16
5654
5989
 
5655
5990
  (void) src1;
5656
5991
  (void) dst;
5657
- (void) src0_ddf_i;
5658
- (void) i02;
5659
- (void) i1;
5992
+ (void) src1_ddq_i;
5993
+ (void) src1_ncols;
5994
+ (void) src1_padded_row_size;
5660
5995
  }
5661
5996
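
Whether the dequantize kernels above consume src1 as fp32 or fp16 is decided at compile time. A minimal sketch of the switch, assuming the real dfloat typedef lives earlier in this file behind the same GGML_CUDA_F16 guard:

    #ifdef GGML_CUDA_F16
    #include <cuda_fp16.h>
    typedef half  dfloat; // src1 is copied to fp16 first; kernels use half intrinsics
    #else
    typedef float dfloat; // src1 stays fp32; no conversion buffer is allocated
    #endif // GGML_CUDA_F16
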
 
5662
5997
  inline void ggml_cuda_op_mul_mat_cublas(
5663
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5664
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5665
- cudaStream_t & cudaStream_main){
5998
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5999
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
6000
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5666
6001
 
5667
- GGML_ASSERT(src0_ddf_i != nullptr);
6002
+ GGML_ASSERT(src0_dd_i != nullptr);
5668
6003
  GGML_ASSERT(src1_ddf_i != nullptr);
5669
- GGML_ASSERT(dst_ddf_i != nullptr);
6004
+ GGML_ASSERT(dst_dd_i != nullptr);
5670
6005
 
5671
6006
  const float alpha = 1.0f;
5672
6007
  const float beta = 0.0f;
@@ -5674,43 +6009,54 @@ inline void ggml_cuda_op_mul_mat_cublas(
5674
6009
  const int64_t ne00 = src0->ne[0];
5675
6010
 
5676
6011
  const int64_t ne10 = src1->ne[0];
5677
- const int64_t ne11 = src1->ne[1];
5678
6012
 
5679
6013
  const int64_t ne0 = dst->ne[0];
5680
- const int64_t i01_diff = i01_high - i01_low;
6014
+ const int64_t row_diff = row_high - row_low;
6015
+
6016
+ float * src0_ddq_as_f32;
6017
+ size_t src0_as = 0;
6018
+
6019
+ if (src0->type != GGML_TYPE_F32) {
6020
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
6021
+ src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
6022
+ to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
6023
+ }
6024
+ const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
5681
6025
 
5682
6026
  int id;
5683
6027
  CUDA_CHECK(cudaGetDevice(&id));
5684
6028
 
5685
6029
  // the main device has a larger memory buffer to hold the results from all GPUs
5686
6030
  // ldc == nrows of the matrix that cuBLAS writes into
5687
- int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
6031
+ int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
5688
6032
 
5689
- CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], cudaStream_main));
6033
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
5690
6034
  CUBLAS_CHECK(
5691
6035
  cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
5692
- i01_diff, ne11, ne10,
6036
+ row_diff, src1_ncols, ne10,
5693
6037
  &alpha, src0_ddf_i, ne00,
5694
- src1_ddf_i, ne10,
5695
- &beta, dst_ddf_i, ldc));
6038
+ src1_ddf_i, ne10,
6039
+ &beta, dst_dd_i, ldc));
6040
+
6041
+ if (src0_as > 0) {
6042
+ ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
6043
+ }
5696
6044
 
5697
6045
  (void) dst;
5698
- (void) src0_ddq_i;
5699
- (void) i02;
5700
- (void) i1;
6046
+ (void) src1_ddq_i;
6047
+ (void) src1_padded_row_size;
5701
6048
  }
5702
6049
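
For reference, the cublasSgemm call above (CUBLAS_OP_T on src0, CUBLAS_OP_N on src1) computes dst[col*ldc + row] = sum_k src0[row*ne00 + k] * src1[col*ne10 + k]. A hypothetical host-side equivalent, assuming ne00 == ne10 as the shared inner dimension:

    #include <cstdint>

    // Naive reference for the row_diff x src1_ncols block produced on each
    // device; ldc is ne0 on the main device and row_diff elsewhere.
    static void sgemm_ref(const float * src0, const float * src1, float * dst,
                          int64_t row_diff, int64_t src1_ncols, int64_t ne00, int64_t ldc) {
        for (int64_t col = 0; col < src1_ncols; ++col) {
            for (int64_t row = 0; row < row_diff; ++row) {
                float sum = 0.0f;
                for (int64_t k = 0; k < ne00; ++k) {
                    sum += src0[row*ne00 + k] * src1[col*ne00 + k];
                }
                dst[col*ldc + row] = sum;
            }
        }
    }
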
 
5703
6050
  inline void ggml_cuda_op_rope(
5704
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5705
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5706
- cudaStream_t & cudaStream_main){
6051
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6052
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5707
6053
 
5708
- GGML_ASSERT(src0_ddf_i != nullptr);
5709
- GGML_ASSERT(dst_ddf_i != nullptr);
6054
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6055
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5710
6056
 
5711
6057
  const int64_t ne00 = src0->ne[0];
5712
6058
  const int64_t ne01 = src0->ne[1];
5713
- const int64_t i01_diff = i01_high - i01_low;
6059
+ const int64_t nrows = ggml_nrows(src0);
5714
6060
 
5715
6061
  const int n_past = ((int32_t *) dst->op_params)[0];
5716
6062
  const int n_dims = ((int32_t *) dst->op_params)[1];
@@ -5723,44 +6069,37 @@ inline void ggml_cuda_op_rope(
5723
6069
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
5724
6070
 
5725
6071
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
6072
+ const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5726
6073
 
5727
6074
  const bool is_neox = mode & 2;
5728
6075
  const bool is_glm = mode & 4;
5729
6076
 
5730
6077
  // compute
5731
6078
  if (is_glm) {
5732
- const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
5733
- const float id_p = min(p, n_ctx - 2.f);
5734
- const float block_p = max(p - (n_ctx - 2.f), 0.f);
5735
- rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
6079
+ rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream);
5736
6080
  } else if (is_neox) {
5737
6081
  GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
5738
- const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5739
- rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
6082
+ rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
5740
6083
  } else {
5741
- const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5742
- rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
6084
+ rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
5743
6085
  }
5744
6086
 
5745
6087
  (void) src1;
5746
6088
  (void) dst;
5747
- (void) src0_ddq_i;
5748
- (void) src1_ddf_i;
5749
- (void) i1;
6089
+ (void) src1_dd;
5750
6090
  }
5751
6091
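
The host side only prepares theta_scale and p0; the per-position angles are formed inside the kernels. A worked example of those two values with illustrative hyperparameters (128 rotary dims, freq_base 10000; real values are model-dependent):

    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_dims     = 128;      // illustrative
        const float freq_base  = 10000.0f; // illustrative
        const float freq_scale = 1.0f;
        const int   n_past     = 32;
        const int   mode       = 0;

        const float theta_scale = powf(freq_base, -2.0f/n_dims);
        const float p0 = ((mode & 1) == 0 ? n_past : 0) * freq_scale;

        printf("theta_scale = %.6f, p0 = %.1f\n", theta_scale, p0); // ~0.866, 32.0
        return 0;
    }
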
 
5752
6092
  inline void ggml_cuda_op_alibi(
5753
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5754
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5755
- cudaStream_t & cudaStream_main){
6093
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6094
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5756
6095
 
5757
- GGML_ASSERT(src0_ddf_i != nullptr);
5758
- GGML_ASSERT(dst_ddf_i != nullptr);
6096
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6097
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5759
6098
 
5760
6099
  const int64_t ne00 = src0->ne[0];
5761
6100
  const int64_t ne01 = src0->ne[1];
5762
6101
  const int64_t ne02 = src0->ne[2];
5763
- const int64_t i01_diff = i01_high - i01_low;
6102
+ const int64_t nrows = ggml_nrows(src0);
5764
6103
 
5765
6104
  const int n_past = ((int32_t *) dst->op_params)[0];
5766
6105
  const int n_head = ((int32_t *) dst->op_params)[1];
@@ -5775,334 +6114,354 @@ inline void ggml_cuda_op_alibi(
5775
6114
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
5776
6115
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
5777
6116
 
5778
- // compute
5779
- alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
6117
+ alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
5780
6118
 
5781
6119
  (void) src1;
5782
- (void) src0_ddq_i;
5783
- (void) src1_ddf_i;
5784
- (void) i1;
6120
+ (void) src1_dd;
5785
6121
  }
5786
6122
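
m0 and m1 are the two geometric bases of the ALiBi slope schedule. A hedged host-side sketch of how the per-head slope is typically derived from them (the device kernel computes the equivalent quantity per block; the loop below is illustrative):

    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_head   = 12;   // illustrative
        const float max_bias = 8.0f; // illustrative

        const int n_heads_log2_floor = 1 << (int) floorf(log2f((float) n_head));
        const float m0 = powf(2.0f, -(max_bias)        / n_heads_log2_floor);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

        for (int h = 0; h < n_head; ++h) {
            const float slope = h < n_heads_log2_floor
                ? powf(m0, h + 1)
                : powf(m1, 2*(h - n_heads_log2_floor) + 1);
            printf("head %2d: slope %g\n", h, slope);
        }
        return 0;
    }
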
 
5787
6123
  inline void ggml_cuda_op_diag_mask_inf(
5788
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5789
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5790
- cudaStream_t & cudaStream_main){
6124
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6125
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5791
6126
 
5792
- GGML_ASSERT(src0_ddf_i != nullptr);
5793
- GGML_ASSERT(dst_ddf_i != nullptr);
6127
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6128
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5794
6129
 
5795
6130
  const int64_t ne00 = src0->ne[0];
5796
6131
  const int64_t ne01 = src0->ne[1];
5797
- const int64_t i01_diff = i01_high - i01_low;
6132
+ const int nrows0 = ggml_nrows(src0);
5798
6133
 
5799
6134
  const int n_past = ((int32_t *) dst->op_params)[0];
5800
6135
 
5801
- // compute
5802
- diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
6136
+ diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
5803
6137
 
5804
6138
  (void) src1;
5805
6139
  (void) dst;
5806
- (void) src0_ddq_i;
5807
- (void) src1_ddf_i;
5808
- (void) i02;
5809
- (void) i1;
6140
+ (void) src1_dd;
5810
6141
  }
5811
6142
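
diag_mask_inf_f32_cuda applies the causal mask over nrows0 rows treated as stacked ne01-row matrices. A host-side statement of the semantics (conceptual reference only; the kernel may implement the masking arithmetic differently):

    #include <cmath>
    #include <cstdint>

    // Within each block of ne01 rows, column j of row i is masked once j > n_past + i.
    static void diag_mask_inf_ref(const float * x, float * dst, int64_t ne00,
                                  int64_t nrows, int64_t ne01, int n_past) {
        for (int64_t row = 0; row < nrows; ++row) {
            const int64_t i = row % ne01; // row index within the current matrix
            for (int64_t col = 0; col < ne00; ++col) {
                dst[row*ne00 + col] = col > n_past + i ? -INFINITY : x[row*ne00 + col];
            }
        }
    }
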
 
5812
6143
  inline void ggml_cuda_op_soft_max(
5813
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5814
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5815
- cudaStream_t & cudaStream_main){
6144
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6145
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5816
6146
 
5817
- GGML_ASSERT(src0_ddf_i != nullptr);
5818
- GGML_ASSERT(dst_ddf_i != nullptr);
6147
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6148
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5819
6149
 
5820
6150
  const int64_t ne00 = src0->ne[0];
5821
- const int64_t i01_diff = i01_high - i01_low;
6151
+ const int64_t nrows = ggml_nrows(src0);
5822
6152
 
5823
- // compute
5824
- soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
6153
+ soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
5825
6154
 
5826
6155
  (void) src1;
5827
6156
  (void) dst;
5828
- (void) src0_ddq_i;
5829
- (void) src1_ddf_i;
5830
- (void) i02;
5831
- (void) i1;
6157
+ (void) src1_dd;
5832
6158
  }
5833
6159
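
soft_max_f32_cuda runs one softmax per row of length ne00 over all nrows rows. A numerically stable host-side reference (the device kernel organizes its reduction differently and may handle the max term in another way):

    #include <cmath>
    #include <cstdint>

    static void soft_max_ref(const float * x, float * dst, int64_t ne00, int64_t nrows) {
        for (int64_t row = 0; row < nrows; ++row) {
            const float * xr = x   + row*ne00;
            float       * dr = dst + row*ne00;
            float max_val = -INFINITY;
            for (int64_t i = 0; i < ne00; ++i) max_val = fmaxf(max_val, xr[i]);
            float sum = 0.0f;
            for (int64_t i = 0; i < ne00; ++i) { dr[i] = expf(xr[i] - max_val); sum += dr[i]; }
            for (int64_t i = 0; i < ne00; ++i) dr[i] /= sum;
        }
    }
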
 
5834
6160
  inline void ggml_cuda_op_scale(
5835
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5836
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5837
- cudaStream_t & cudaStream_main){
6161
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6162
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5838
6163
 
5839
- GGML_ASSERT(src0_ddf_i != nullptr);
5840
- GGML_ASSERT(dst_ddf_i != nullptr);
6164
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6165
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
6166
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5841
6167
 
5842
6168
  const float scale = ((float *) src1->data)[0];
5843
6169
 
5844
- const int64_t ne00 = src0->ne[0];
5845
- const int64_t i01_diff = i01_high - i01_low;
5846
-
5847
- // compute
5848
- scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
6170
+ scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
5849
6171
  CUDA_CHECK(cudaGetLastError());
5850
6172
 
5851
6173
  (void) src1;
5852
6174
  (void) dst;
5853
- (void) src0_ddq_i;
5854
- (void) src1_ddf_i;
5855
- (void) i02;
5856
- (void) i1;
6175
+ (void) src1_dd;
6176
+ }
6177
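
The flattened scale op now runs over ggml_nelements(src0) in a single launch instead of per-row slices (ne00*i01_diff). For orientation, ggml_nelements is simply the product of the four ne dimensions; an illustrative stand-in:

    #include <cstdint>

    // Stand-in for ggml_nelements(): total element count of a 4-D ggml tensor.
    static int64_t nelements(const int64_t ne[4]) {
        return ne[0]*ne[1]*ne[2]*ne[3];
    }
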
+
6178
+ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
6179
+ const int64_t nrows0 = ggml_nrows(src0);
6180
+
6181
+ const bool use_src1 = src1 != nullptr;
6182
+ const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
6183
+
6184
+ GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6185
+ GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
6186
+
6187
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6188
+ struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
6189
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6190
+
6191
+ const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6192
+ const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
6193
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
6194
+
6195
+ const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
6196
+
6197
+ // dd = data device
6198
+ float * src0_ddf = nullptr;
6199
+ float * src1_ddf = nullptr;
6200
+ float * dst_ddf = nullptr;
6201
+
6202
+ // as = actual size
6203
+ size_t src0_asf = 0;
6204
+ size_t src1_asf = 0;
6205
+ size_t dst_asf = 0;
6206
+
6207
+ ggml_cuda_set_device(g_main_device);
6208
+ const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6209
+
6210
+ if (src0_on_device) {
6211
+ src0_ddf = (float *) src0_extra->data_device[g_main_device];
6212
+ } else {
6213
+ src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
6214
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
6215
+ }
6216
+
6217
+ if (use_src1 && !src1_stays_on_host) {
6218
+ if (src1_on_device) {
6219
+ src1_ddf = (float *) src1_extra->data_device[g_main_device];
6220
+ } else {
6221
+ src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
6222
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
6223
+ }
6224
+ }
6225
+ if (dst_on_device) {
6226
+ dst_ddf = (float *) dst_extra->data_device[g_main_device];
6227
+ } else {
6228
+ dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
6229
+ }
6230
+
6231
+ // do the computation
6232
+ op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
6233
+ CUDA_CHECK(cudaGetLastError());
6234
+
6235
+ // copy dst to host if necessary
6236
+ if (!dst_on_device) {
6237
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
6238
+ }
6239
+
6240
+ if (src0_asf > 0) {
6241
+ ggml_cuda_pool_free(src0_ddf, src0_asf);
6242
+ }
6243
+ if (src1_asf > 0) {
6244
+ ggml_cuda_pool_free(src1_ddf, src1_asf);
6245
+ }
6246
+ if (dst_asf > 0) {
6247
+ ggml_cuda_pool_free(dst_ddf, dst_asf);
6248
+ }
6249
+
6250
+ if (dst->backend == GGML_BACKEND_CPU) {
6251
+ CUDA_CHECK(cudaDeviceSynchronize());
6252
+ }
5857
6253
  }
5858
6254
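
The rewritten mul_mat path that replaces ggml_cuda_op below no longer iterates rows per i02/i03 slice; instead it walks src1 in fixed-size column chunks so the work can be spread across streams and devices. A toy sketch of the chunk loop (the stride value is illustrative; the real constant MUL_MAT_SRC1_COL_STRIDE is defined elsewhere in this file):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t ne11       = 300; // src1 columns, illustrative
        const int64_t col_stride = 128; // stands in for MUL_MAT_SRC1_COL_STRIDE

        for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += col_stride) {
            const int64_t src1_ncols = std::min(col_stride, ne11 - src1_col_0);
            printf("chunk: columns [%lld, %lld)\n",
                   (long long) src1_col_0, (long long) (src1_col_0 + src1_ncols));
        }
        return 0;
    }
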
 
5859
- static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5860
- ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
6255
+ static void ggml_cuda_op_mul_mat(
6256
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
6257
+ const bool convert_src1_to_q8_1) {
6258
+
5861
6259
  const int64_t ne00 = src0->ne[0];
5862
6260
  const int64_t ne01 = src0->ne[1];
5863
6261
  const int64_t ne02 = src0->ne[2];
5864
6262
  const int64_t ne03 = src0->ne[3];
5865
6263
  const int64_t nrows0 = ggml_nrows(src0);
5866
6264
 
5867
- const bool use_src1 = src1 != nullptr;
5868
- const int64_t ne10 = use_src1 ? src1->ne[0] : 1;
5869
- const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
5870
- const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
5871
- const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
5872
- const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
6265
+ const int64_t ne10 = src1->ne[0];
6266
+ const int64_t ne11 = src1->ne[1];
6267
+ const int64_t ne12 = src1->ne[2];
6268
+ const int64_t ne13 = src1->ne[3];
6269
+ const int64_t nrows1 = ggml_nrows(src1);
5873
6270
 
5874
6271
  GGML_ASSERT(ne03 == ne13);
5875
6272
 
5876
6273
  const int64_t ne0 = dst->ne[0];
5877
6274
  const int64_t ne1 = dst->ne[1];
5878
6275
 
5879
- const int nb2 = dst->nb[2];
5880
- const int nb3 = dst->nb[3];
6276
+ const int nb2 = dst->nb[2];
6277
+ const int nb3 = dst->nb[3];
5881
6278
 
5882
6279
  GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
5883
- GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6280
+ GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
5884
6281
 
5885
- // strides for iteration over dims 3 and 2
5886
- const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
5887
- const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
5888
- const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
5889
- const int64_t src0_stride = ne00 * ne01 * stride_mod;
5890
- const int64_t src1_stride = ne10 * ne11 * stride_mod;
5891
- const int64_t dst_stride = ne0 * ne1 * stride_mod;
6282
+ GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
5892
6283
 
5893
- const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
5894
- const int64_t i03_max = flatten_rows ? 1 : ne03;
5895
- const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
5896
- const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
5897
- GGML_ASSERT(!(flatten_rows && ne02 < ne12));
6284
+ const int64_t i02_divisor = ne12 / ne02;
5898
6285
 
5899
6286
  const size_t src0_ts = ggml_type_size(src0->type);
5900
6287
  const size_t src0_bs = ggml_blck_size(src0->type);
6288
+ const size_t q8_1_ts = sizeof(block_q8_1);
6289
+ const size_t q8_1_bs = QK8_1;
5901
6290
 
5902
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
5903
- struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
5904
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6291
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6292
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6293
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
5905
6294
 
5906
6295
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
5907
6296
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
5908
- const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
5909
6297
 
5910
- const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
5911
- const bool src1_stays_on_host = use_src1 && (
5912
- dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
6298
+ const bool src1_is_contiguous = ggml_is_contiguous(src1);
6299
+ const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ?
6300
+ ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
5913
6301
 
5914
6302
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
6303
+ GGML_ASSERT(!(split && ne02 > 1));
6304
+ GGML_ASSERT(!(split && ne03 > 1));
5915
6305
  GGML_ASSERT(!(split && ne02 < ne12));
5916
6306
 
5917
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
5918
-
5919
6307
  // dd = data device
5920
- char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
5921
- float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
5922
- float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
5923
- float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
5924
-
5925
- // asq = actual size quantized, asf = actual size float
5926
- size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
5927
- size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
6308
+ char * src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
6309
+ float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
6310
+ char * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
6311
+ float * dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
6312
+
6313
+ // as = actual size
6314
+ size_t src0_as[GGML_CUDA_MAX_DEVICES] = {0};
5928
6315
  size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
5929
- size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
6316
+ size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
6317
+ size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0};
5930
6318
 
5931
- // if multiple devices are used they need to wait for the main device
5932
- // here an event is recorded that signifies that the main device has finished calculating the input data
5933
- if (split && g_device_count > 1) {
5934
- CUDA_CHECK(cudaSetDevice(g_main_device));
5935
- CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
5936
- }
6319
+ int64_t row_low[GGML_CUDA_MAX_DEVICES];
6320
+ int64_t row_high[GGML_CUDA_MAX_DEVICES];
5937
6321
 
5938
- for (int id = 0; id < g_device_count; ++id) {
5939
- if (!split && id != g_main_device) {
5940
- continue;
5941
- }
6322
+ for (int64_t id = 0; id < g_device_count; ++id) {
6323
+ // by default, use all rows
6324
+ row_low[id] = 0;
6325
+ row_high[id] = ne01;
5942
6326
 
5943
- const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device;
5944
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
5945
-
5946
- int64_t row_low, row_high;
6327
+ // for multi GPU, get the row boundaries from tensor split
6328
+ // and round to mul_mat_q tile sizes
5947
6329
  if (split) {
5948
6330
  const int64_t rounding = get_row_rounding(src0->type);
5949
6331
 
5950
- row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
5951
- row_low -= row_low % rounding;
6332
+ if (id != 0) {
6333
+ row_low[id] = ne01*g_tensor_split[id];
6334
+ row_low[id] -= row_low[id] % rounding;
6335
+ }
5952
6336
 
5953
- if (id == g_device_count - 1) {
5954
- row_high = nrows0;
5955
- } else {
5956
- row_high = nrows0*g_tensor_split[id + 1];
5957
- row_high -= row_high % rounding;
6337
+ if (id != g_device_count - 1) {
6338
+ row_high[id] = ne01*g_tensor_split[id + 1];
6339
+ row_high[id] -= row_high[id] % rounding;
5958
6340
  }
5959
- } else {
5960
- row_low = 0;
5961
- row_high = nrows0*i02_divisor;
5962
6341
  }
5963
- if (row_low == row_high) {
6342
+ }
6343
+
6344
+ for (int64_t id = 0; id < g_device_count; ++id) {
6345
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
5964
6346
  continue;
5965
6347
  }
5966
6348
 
5967
- int64_t row_diff = row_high - row_low;
5968
-
5969
- cudaSetDevice(id);
5970
- cudaStream_t cudaStream_main = g_cudaStreams_main[id];
6349
+ const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
6350
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
5971
6351
 
5972
- // wait for main GPU data if necessary
5973
- if (split && id != g_main_device) {
5974
- CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
5975
- }
6352
+ ggml_cuda_set_device(id);
6353
+ const cudaStream_t stream = g_cudaStreams[id][0];
5976
6354
 
5977
6355
  if (src0_on_device && src0_is_contiguous) {
5978
- if (src0_is_f32) {
5979
- src0_ddf[id] = (float *) src0_extra->data_device[id];
5980
- } else {
5981
- src0_ddq[id] = (char *) src0_extra->data_device[id];
5982
- }
6356
+ src0_dd[id] = (char *) src0_extra->data_device[id];
5983
6357
  } else {
5984
- if (src0_is_f32) {
5985
- src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
5986
- } else {
5987
- src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
5988
- }
6358
+ const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
6359
+ src0_dd[id] = (char *) ggml_cuda_pool_malloc(size_src0_ddq, &src0_as[id]);
5989
6360
  }
5990
6361
 
5991
- if (src0_needs_f32 && !src0_is_f32) {
5992
- src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
6362
+ if (src1_on_device && src1_is_contiguous) {
6363
+ src1_ddf[id] = (float *) src1_extra->data_device[id];
6364
+ } else {
6365
+ src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
5993
6366
  }
5994
6367
 
5995
- if (use_src1 && !src1_stays_on_host) {
5996
- if (src1_on_device && src1_is_contiguous) {
5997
- src1_ddf[id] = (float *) src1_extra->data_device[id];
5998
- } else {
5999
- src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
6368
+ if (convert_src1_to_q8_1) {
6369
+ src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
6370
+
6371
+ if (split && src1_on_device && src1_is_contiguous) {
6372
+ quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
6373
+ CUDA_CHECK(cudaGetLastError());
6000
6374
  }
6001
6375
  }
6376
+
6002
6377
  if (dst_on_device) {
6003
- dst_ddf[id] = (float *) dst_extra->data_device[id];
6378
+ dst_dd[id] = (float *) dst_extra->data_device[id];
6004
6379
  } else {
6005
- size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float);
6006
- dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
6380
+ const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
6381
+ dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
6007
6382
  }
6383
+ }
6008
6384
 
6009
- for (int64_t i03 = 0; i03 < i03_max; i03++) {
6010
- const int64_t i13 = i03 % ne13;
6011
- for (int64_t i02 = 0; i02 < i02_max; i02++) {
6012
- const int64_t i12 = i02 % ne12;
6385
+ // if multiple devices are used they need to wait for the main device
6386
+ // here an event is recorded that signals that the main device has finished calculating the input data
6387
+ if (split && g_device_count > 1) {
6388
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6389
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
6390
+ }
6013
6391
 
6014
- const int64_t i0 = i03*i02_max + i02;
6392
+ const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
6393
+ for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
6394
+ const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
6395
+ const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
6015
6396
 
6016
- // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
6017
- const int64_t i0_offset_low = row_low/rows_per_iter;
6018
- const int64_t i0_offset_high = row_high/rows_per_iter;
6397
+ for (int64_t id = 0; id < g_device_count; ++id) {
6398
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
6399
+ continue;
6400
+ }
6019
6401
 
6020
- int64_t i01_low = 0;
6021
- int64_t i01_high = rows_per_iter;
6022
- if (split) {
6023
- if (i0 < i0_offset_low || i0 > i0_offset_high) {
6024
- continue;
6025
- }
6026
- if (i0 == i0_offset_low) {
6027
- i01_low = row_low % rows_per_iter;
6028
- }
6029
- if (i0 == i0_offset_high) {
6030
- i01_high = row_high % rows_per_iter;
6031
- }
6032
- }
6402
+ const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
6403
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
6404
+ const int64_t row_diff = row_high[id] - row_low[id];
6033
6405
 
6034
- // There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or optimizing out local variables.
6035
- // Removing the first assert or changing the order of the arguments causes the second assert to fail.
6036
- // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
6037
- // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
6038
- GGML_ASSERT(i01_low == 0 || g_device_count > 1);
6039
- GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
6406
+ ggml_cuda_set_device(id);
6407
+ const cudaStream_t stream = g_cudaStreams[id][is];
6040
6408
 
6041
- const int64_t i01_diff = i01_high - i01_low;
6042
- if (i01_diff == 0) {
6043
- continue;
6044
- }
6045
- const int64_t i11 = i13*ne12 + i12;
6409
+ // wait for main GPU data if necessary
6410
+ if (split && (id != g_main_device || is != 0)) {
6411
+ CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
6412
+ }
6413
+
6414
+ for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
6415
+ const int64_t i03 = i0 / ne12;
6416
+ const int64_t i02 = i0 % ne12;
6417
+
6418
+ const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
6046
6419
 
6047
6420
  // for split tensors the data begins at i0 == i0_offset_low
6048
- char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
6049
- float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
6050
- float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
6051
- float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
6052
-
6053
- // for split tensors the data pointer needs to be rounded down
6054
- // to the bin edge for i03, i02 bins beyond the first
6055
- if (i0 - i0_offset_low > 0) {
6056
- GGML_ASSERT(!flatten_rows);
6057
- src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
6058
- src0_ddf_i -= (row_low % ne01)*ne00;
6059
- dst_ddf_i -= (row_low % ne0)*ne1;
6060
- }
6421
+ char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
6422
+ float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
6423
+ char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
6424
+ float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
6061
6425
 
6062
6426
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
6063
6427
  // in that case an offset on dst_ddf_i is needed
6064
6428
  if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
6065
- dst_ddf_i += i01_low; // offset is 0 if no tensor split
6429
+ dst_dd_i += row_low[id]; // offset is 0 if no tensor split
6066
6430
  }
6067
6431
 
6068
6432
  // copy src0, src1 to device if necessary
6069
- if (use_src1 && !src1_stays_on_host) {
6070
- if (src1->backend == GGML_BACKEND_CPU) {
6071
- GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
6072
- int64_t nrows1 = flatten_rows ? nrows0 : ne11;
6073
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
6074
- } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
6075
- if (id != g_main_device) {
6076
- GGML_ASSERT(!flatten_rows);
6433
+ if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
6434
+ if (id != g_main_device) {
6435
+ if (convert_src1_to_q8_1) {
6436
+ char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
6437
+ CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs,
6438
+ cudaMemcpyDeviceToDevice, stream));
6439
+ } else {
6077
6440
  float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
6078
- src1_ddf_i_source += i11*src1_stride;
6079
- CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
6080
- cudaMemcpyDeviceToDevice, cudaStream_main));
6441
+ src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
6442
+ CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float),
6443
+ cudaMemcpyDeviceToDevice, stream));
6081
6444
  }
6082
- } else if (src1_on_device && !src1_is_contiguous) {
6083
- GGML_ASSERT(!split);
6084
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
6085
- } else {
6086
- GGML_ASSERT(false);
6087
6445
  }
6446
+ } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
6447
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
6448
+ src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
6449
+ } else {
6450
+ GGML_ASSERT(false);
6088
6451
  }
6089
6452
 
6090
- if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
6091
- if (src0_is_f32) {
6092
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
6093
- } else {
6094
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
6095
- }
6453
+ if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
6454
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
6455
+ CUDA_CHECK(cudaGetLastError());
6096
6456
  }
6097
6457
 
6098
- // convert src0 to f32 if it is necessary for the ggml_cuda_op
6099
- if (src0_needs_f32 && !src0_is_f32) {
6100
- to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
6101
- CUDA_CHECK(cudaGetLastError());
6458
+ if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
6459
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
6102
6460
  }
6103
6461
 
6104
6462
  // do the computation
6105
- op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
6463
+ op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
6464
+ row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
6106
6465
  CUDA_CHECK(cudaGetLastError());
6107
6466
 
6108
6467
  // copy dst to host or other device if necessary
@@ -6124,95 +6483,86 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6124
6483
  // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
6125
6484
  // Instead they need to be copied to the correct slice in ne0 = dst row index.
6126
6485
  // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
6127
- float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
6128
- CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
6129
- i01_diff*sizeof(float), ne1, kind, cudaStream_main));
6486
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
6487
+ GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
6488
+ dhf_dst_i += src1_col_0*ne0 + row_low[id];
6489
+ CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float),
6490
+ row_diff*sizeof(float), src1_ncols, kind, stream));
6130
6491
  } else {
6131
6492
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
6132
- CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
6493
+ GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
6494
+ dhf_dst_i += src1_col_0*ne0;
6495
+ CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
6133
6496
  }
6134
6497
  }
6135
6498
 
6136
- // signify to main device that other device is done
6137
- if (split && g_device_count > 1 && id != g_main_device) {
6138
- CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
6499
+ // add event for the main device to wait on until other device is done
6500
+ if (split && (id != g_main_device || is != 0)) {
6501
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
6139
6502
  }
6140
6503
  }
6141
6504
  }
6142
6505
  }
6143
6506
 
6144
- // wait until each device is finished, then free their buffers
6145
- for (int id = 0; id < g_device_count; ++id) {
6146
- if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
6147
- continue;
6148
- }
6149
-
6150
- CUDA_CHECK(cudaSetDevice(id));
6507
+ for (int64_t id = 0; id < g_device_count; ++id) {
6508
+ CUDA_CHECK(ggml_cuda_set_device(id));
6151
6509
 
6152
- if (src0_asq[id] > 0) {
6153
- ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
6154
- }
6155
- if (src0_asf[id] > 0) {
6156
- ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
6510
+ // free buffers again when done
6511
+ if (src0_as[id] > 0) {
6512
+ ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
6157
6513
  }
6158
6514
  if (src1_asf[id] > 0) {
6159
6515
  ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
6160
6516
  }
6161
- if (dst_asf[id] > 0) {
6162
- ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
6517
+ if (src1_asq[id] > 0) {
6518
+ ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
6519
+ }
6520
+ if (dst_as[id] > 0) {
6521
+ ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
6163
6522
  }
6164
6523
  }
6165
6524
 
6166
6525
  // main device waits for all other devices to be finished
6167
6526
  if (split && g_device_count > 1) {
6168
- CUDA_CHECK(cudaSetDevice(g_main_device));
6169
- for (int id = 0; id < g_device_count; ++id) {
6170
- if (id != g_main_device && src0_extra->events[id]) {
6171
- CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
6527
+ int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
6528
+ is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
6529
+
6530
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6531
+ for (int64_t id = 0; id < g_device_count; ++id) {
6532
+ for (int64_t is = 0; is < is_max; ++is) {
6533
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
6172
6534
  }
6173
6535
  }
6174
6536
  }
6175
6537
 
6176
6538
  if (dst->backend == GGML_BACKEND_CPU) {
6177
- CUDA_CHECK(cudaSetDevice(g_main_device));
6539
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6178
6540
  CUDA_CHECK(cudaDeviceSynchronize());
6179
6541
  }
6180
6542
  }
6181
6543
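
The trickiest step in the function above is writing each device's partial result back into the full dst matrix: every device produces a row_diff x src1_ncols block that must land at row offset row_low[id] and column offset src1_col_0. A host-side model of the cudaMemcpy2DAsync call used for split tensors (batch offsets i02/i03 omitted for brevity):

    #include <cstdint>
    #include <cstring>

    // dpitch is ne0 floats (full dst rows), spitch is row_diff floats
    // (the compact per-device block).
    static void copy_partial_result(float * dst, const float * dst_dd_i,
                                    int64_t ne0, int64_t row_low, int64_t row_diff,
                                    int64_t src1_col_0, int64_t src1_ncols) {
        for (int64_t col = 0; col < src1_ncols; ++col) {
            std::memcpy(dst + (src1_col_0 + col)*ne0 + row_low,
                        dst_dd_i + col*row_diff,
                        row_diff*sizeof(float));
        }
    }
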
 
6182
6544
  void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6183
- // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
6184
- // Due to flatten_rows == true this does in practice not make a difference however.
6185
- // Better solution would be nice but right now that would require disproportionate changes.
6186
- GGML_ASSERT(
6187
- (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
6188
- src1->type == GGML_TYPE_F32 &&
6189
- (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
6190
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
6545
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
6191
6546
  }
6192
6547
 
6193
6548
  void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6194
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6195
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
6549
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
6196
6550
  }
6197
6551
 
6198
6552
  void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6199
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6200
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
6553
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
6201
6554
  }
6202
6555
 
6203
6556
  void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6204
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6205
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
6557
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
6206
6558
  }
6207
6559
 
6208
6560
  void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6209
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6210
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
6561
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
6211
6562
  }
6212
6563
 
6213
6564
  void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6214
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6215
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
6565
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
6216
6566
  }
6217
6567
 
6218
6568
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
@@ -6246,8 +6596,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
6246
6596
 
6247
6597
  const int64_t ne12 = src1->ne[2];
6248
6598
 
6249
- CUDA_CHECK(cudaSetDevice(g_main_device));
6250
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
6599
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6600
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6251
6601
 
6252
6602
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6253
6603
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -6258,7 +6608,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
6258
6608
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6259
6609
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6260
6610
 
6261
- ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
6611
+ ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
6262
6612
  }
6263
6613
 
6264
6614
  void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -6277,8 +6627,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
6277
6627
  const int64_t nb01 = src0->nb[1];
6278
6628
  const int64_t nb02 = src0->nb[2];
6279
6629
 
6280
- CUDA_CHECK(cudaSetDevice(g_main_device));
6281
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
6630
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6631
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6282
6632
 
6283
6633
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6284
6634
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -6289,38 +6639,49 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
6289
6639
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6290
6640
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6291
6641
 
6292
- const int row_stride_x = nb01 / sizeof(half);
6293
- const int channel_stride_x = nb02 / sizeof(half);
6642
+ const int64_t row_stride_x = nb01 / sizeof(half);
6643
+ const int64_t channel_stride_x = nb02 / sizeof(half);
6294
6644
 
6295
- ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
6645
+ ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
6296
6646
  }
6297
6647
 
6298
6648
  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6299
6649
  bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
6300
6650
  src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
6301
6651
 
6652
+ int64_t min_compute_capability = INT_MAX;
6653
+ for (int64_t id = 0; id < g_device_count; ++id) {
6654
+ if (min_compute_capability > g_compute_capabilities[id]
6655
+ && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
6656
+ min_compute_capability = g_compute_capabilities[id];
6657
+ }
6658
+ }
6659
+
6302
6660
  if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
6303
6661
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
6304
6662
  } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
6305
6663
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
6306
6664
  } else if (src0->type == GGML_TYPE_F32) {
6307
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
6665
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
6308
6666
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
6309
6667
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
6310
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
6311
- } else {
6312
- int min_compute_capability = INT_MAX;
6313
- for (int id = 0; id < g_device_count; ++id) {
6314
- if (min_compute_capability > g_compute_capabilities[id]
6315
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
6316
- min_compute_capability = g_compute_capabilities[id];
6317
- }
6318
- }
6319
6668
 
6669
+ #ifdef GGML_CUDA_FORCE_DMMV
6670
+ const bool use_mul_mat_vec_q = false;
6671
+ #else
6672
+ const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
6673
+ #endif // GGML_CUDA_FORCE_DMMV
6674
+
6675
+ if (use_mul_mat_vec_q) {
6676
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
6677
+ } else {
6678
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
6679
+ }
6680
+ } else {
6320
6681
  if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
6321
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
6682
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
6322
6683
  } else {
6323
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
6684
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
6324
6685
  }
6325
6686
  }
6326
6687
  } else {
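
In the new ggml_cuda_mul_mat dispatch above, the minimum compute capability is computed once, and only over devices that actually receive a slice of the tensor split (g_tensor_split[id] below the next device's boundary). For quantized matrix-vector products it then chooses between ggml_cuda_op_mul_mat_vec_q, which relies on the __dp4a byte-wise dot product and therefore needs MIN_CC_DP4A (610), and the ggml_cuda_op_dequantize_mul_mat_vec fallback, with GGML_CUDA_FORCE_DMMV forcing the fallback at build time. Restated as a standalone predicate (the function name is an assumption; the logic mirrors the hunk):

    static bool choose_mul_mat_vec_q(const int64_t min_compute_capability, const bool src0_is_quantized) {
    #ifdef GGML_CUDA_FORCE_DMMV
        (void) min_compute_capability;
        (void) src0_is_quantized;
        return false;   // build flag forces the dequantize + mat-vec path
    #else
        // __dp4a requires compute capability >= MIN_CC_DP4A (610)
        return src0_is_quantized && min_compute_capability >= MIN_CC_DP4A;
    #endif // GGML_CUDA_FORCE_DMMV
    }
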
@@ -6329,8 +6690,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
6329
6690
  }
6330
6691
 
6331
6692
  void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6332
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6333
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
6693
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
6334
6694
  }
6335
6695
 
6336
6696
  void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6359,8 +6719,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
6359
6719
  const int64_t nb11 = src1->nb[1];
6360
6720
  const int64_t nb12 = src1->nb[2];
6361
6721
 
6362
- CUDA_CHECK(cudaSetDevice(g_main_device));
6363
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
6722
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6723
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6364
6724
 
6365
6725
  const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6366
6726
  const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -6370,10 +6730,10 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
6370
6730
 
6371
6731
  if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
6372
6732
  ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
6373
- ne10, ne11, nb10, nb11, nb12, cudaStream_main);
6733
+ ne10, ne11, nb10, nb11, nb12, main_stream);
6374
6734
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
6375
6735
  ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
6376
- ne10, ne11, nb10, nb11, nb12, cudaStream_main);
6736
+ ne10, ne11, nb10, nb11, nb12, main_stream);
6377
6737
  } else {
6378
6738
  GGML_ASSERT(false);
6379
6739
  }
@@ -6387,28 +6747,20 @@ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
6387
6747
  }
6388
6748
 
6389
6749
  void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6390
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6391
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
6750
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
6392
6751
  }
6393
6752
 
6394
6753
  void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6395
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6396
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
6754
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
6397
6755
  }
6398
6756
 
6399
6757
  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6400
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6401
6758
  GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
6402
-
6403
- const int mode = ((int32_t *) dst->op_params)[2];
6404
- const bool is_glm = mode & 4;
6405
-
6406
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
6759
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
6407
6760
  }
6408
6761
 
6409
6762
  void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6410
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6411
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
6763
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
6412
6764
  }
6413
6765
 
6414
6766
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
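
The wrappers above (ggml_cuda_scale, ggml_cuda_diag_mask_inf, ggml_cuda_soft_max, ggml_cuda_rope, ggml_cuda_alibi) all collapse to one-line calls into a shared ggml_cuda_op_flatten helper, dropping the per-op F32 asserts and the flatten flags of the old ggml_cuda_op; rope in particular no longer inspects the GLM mode bit to decide whether flattening is allowed. The helper is introduced in an earlier part of this diff; purely for orientation, a declaration shape consistent with these call sites looks roughly like the following (the exact parameter list of the per-op callback is an assumption):

    // hypothetical declarations, not copied from the package
    typedef void (*ggml_cuda_op_flatten_t)(
        const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
        const float * src0_dd, const float * src1_dd, float * dst_dd,
        const cudaStream_t & main_stream);

    static void ggml_cuda_op_flatten(
        const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
        const ggml_cuda_op_flatten_t op);
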
@@ -6418,7 +6770,7 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
6418
6770
  }
6419
6771
 
6420
6772
  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
6421
- int nrows = ggml_nrows(tensor);
6773
+ const int64_t nrows = ggml_nrows(tensor);
6422
6774
 
6423
6775
  const int64_t ne0 = tensor->ne[0];
6424
6776
 
@@ -6428,14 +6780,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
6428
6780
  struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
6429
6781
  memset(extra, 0, sizeof(*extra));
6430
6782
 
6431
- for (int id = 0; id < g_device_count; ++id) {
6783
+ for (int64_t id = 0; id < g_device_count; ++id) {
6432
6784
  if (backend == GGML_BACKEND_GPU && id != g_main_device) {
6433
6785
  continue;
6434
6786
  }
6435
6787
 
6436
- cudaSetDevice(id);
6788
+ ggml_cuda_set_device(id);
6437
6789
 
6438
- int row_low, row_high;
6790
+ int64_t row_low, row_high;
6439
6791
  if (backend == GGML_BACKEND_GPU) {
6440
6792
  row_low = 0;
6441
6793
  row_high = nrows;
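
In ggml_cuda_transform_tensor the row count, the device loop index, and the row bounds are widened from int to int64_t, and device selection again goes through ggml_cuda_set_device. The widening matters because intermediate products of row counts and per-row byte sizes can exceed INT_MAX before they are ever converted to size_t; an illustration with made-up numbers (not values from the diff):

    static size_t split_buffer_size_example(void) {
        const int nrows_i32 = 120000;   // rows in a large tensor (illustrative)
        const int row_bytes = 32768;    // bytes per row (illustrative)
        // (size_t)(nrows_i32 * row_bytes) would overflow the 32-bit multiply first
        // (signed overflow, undefined behaviour); widening an operand keeps the
        // arithmetic in 64 bits:
        return (size_t) nrows_i32 * (size_t) row_bytes;
    }
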
@@ -6485,7 +6837,9 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
6485
6837
  extra->data_device[id] = buf;
6486
6838
 
6487
6839
  if (backend == GGML_BACKEND_GPU_SPLIT) {
6488
- CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
6840
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
6841
+ CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
6842
+ }
6489
6843
  }
6490
6844
  }
6491
6845
 
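
For split tensors the backend now creates one event per device and per stream (extra->events[id][is], MAX_STREAMS of them) instead of a single event per device. Per-stream events are the usual CUDA mechanism for letting one stream consume results another stream produced without a host-side synchronization; a hedged sketch of that pattern, not the scheduling code from this diff:

    static void wait_on_producer_example(cudaEvent_t event, cudaStream_t producer, cudaStream_t consumer) {
        // mark completion of all work queued on the producing stream so far ...
        CUDA_CHECK(cudaEventRecord(event, producer));
        // ... and make the consuming stream wait on it without blocking the host
        CUDA_CHECK(cudaStreamWaitEvent(consumer, event, 0));
    }
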
@@ -6499,15 +6853,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
6499
6853
 
6500
6854
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6501
6855
 
6502
- for (int id = 0; id < g_device_count; ++id) {
6856
+ for (int64_t id = 0; id < g_device_count; ++id) {
6503
6857
  if (extra->data_device[id] != nullptr) {
6504
- CUDA_CHECK(cudaSetDevice(id));
6858
+ CUDA_CHECK(ggml_cuda_set_device(id));
6505
6859
  CUDA_CHECK(cudaFree(extra->data_device[id]));
6506
6860
  }
6507
6861
 
6508
- if (extra->events[id] != nullptr) {
6509
- CUDA_CHECK(cudaSetDevice(id));
6510
- CUDA_CHECK(cudaEventDestroy(extra->events[id]));
6862
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
6863
+ if (extra->events[id][is] != nullptr) {
6864
+ CUDA_CHECK(ggml_cuda_set_device(id));
6865
+ CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
6866
+ }
6511
6867
  }
6512
6868
  }
6513
6869
 
@@ -6559,7 +6915,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
6559
6915
  force_inplace;
6560
6916
  const size_t size = ggml_nbytes(tensor);
6561
6917
 
6562
- CUDA_CHECK(cudaSetDevice(g_main_device));
6918
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6563
6919
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
6564
6920
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
6565
6921
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];