llama_cpp 0.5.0 → 0.5.2

@@ -13,7 +13,7 @@
 #ifdef __HIP_PLATFORM_AMD__
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
-#endif
+#endif // __HIP_PLATFORM_AMD__
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
@@ -68,25 +68,52 @@
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
-#endif
+#endif // defined(GGML_USE_HIPBLAS)
 
 #include "ggml-cuda.h"
 #include "ggml.h"
 
-#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#ifndef CC_TURING
-#define CC_TURING 700
-#endif
+#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define CC_TURING 700
+#define CC_OFFSET_AMD 1000000
+#define CC_RDNA2 CC_OFFSET_AMD + 1030
 
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+    defined(__gfx1150__) || defined(__gfx1151__)
+#define RDNA3
+#endif
+
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+#define RDNA2
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
     const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
     const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
     return reinterpret_cast<const int&>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int&>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
 }
 
 static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
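Note: the new `#else` branch above gives HIP builds without `__builtin_elementwise_sub_sat` a manual per-byte saturating subtraction. A standalone host-side sketch of the same clamping rule, for checking the logic outside the kernel (not part of the gem; plain C++):

    #include <cstdint>
    #include <cstdio>
    #include <limits>

    // Clamp each byte-wise difference to the int8 range, as the fallback above does.
    static uint32_t vsubss4_ref(uint32_t a, uint32_t b) {
        uint32_t out = 0;
        for (int i = 0; i < 4; ++i) {
            const int8_t va = (int8_t) (a >> (8*i));
            const int8_t vb = (int8_t) (b >> (8*i));
            int16_t tmp = (int16_t) (va - vb);
            if (tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
            if (tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
            out |= (uint32_t) (uint8_t) tmp << (8*i);
        }
        return out;
    }

    int main() {
        // -128 - 1 in every lane would underflow; saturation keeps each byte at 0x80 (-128).
        printf("%08x\n", vsubss4_ref(0x80808080u, 0x01010101u));
    }
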
@@ -115,7 +142,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 #endif
     return c;
 }
-#endif
+#endif // defined(GGML_USE_HIPBLAS)
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -127,8 +154,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do { \
         cudaError_t err_ = (err); \
         if (err_ != cudaSuccess) { \
-            fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
+            int id; \
+            cudaGetDevice(&id); \
+            fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
                 cudaGetErrorString(err_)); \
+            fprintf(stderr, "current device: %d\n", id); \
             exit(1); \
         } \
     } while (0)
@@ -138,8 +168,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do { \
         cublasStatus_t err_ = (err); \
         if (err_ != CUBLAS_STATUS_SUCCESS) { \
+            int id; \
+            cudaGetDevice(&id); \
             fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
                 err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
+            fprintf(stderr, "current device: %d\n", id); \
             exit(1); \
         } \
     } while (0)
@@ -148,7 +181,10 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     do { \
         cublasStatus_t err_ = (err); \
         if (err_ != CUBLAS_STATUS_SUCCESS) { \
+            int id; \
+            cudaGetDevice(&id); \
             fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
+            fprintf(stderr, "current device: %d\n", id); \
             exit(1); \
         } \
     } while (0)
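Note: all three checking macros now also print the device that was current when the call failed, which is the detail that matters once tensors are split across several GPUs and streams. The same pattern as a self-contained macro, usable outside this file (a sketch; it only relies on standard CUDA runtime calls):

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    #define CHECK_WITH_DEVICE(err)                                            \
        do {                                                                  \
            cudaError_t err_ = (err);                                         \
            if (err_ != cudaSuccess) {                                        \
                int id;                                                       \
                cudaGetDevice(&id);                                           \
                fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n",             \
                        err_, __FILE__, __LINE__, cudaGetErrorString(err_));  \
                fprintf(stderr, "current device: %d\n", id);                  \
                exit(1);                                                      \
            }                                                                 \
        } while (0)

    int main() {
        void * ptr = nullptr;
        CHECK_WITH_DEVICE(cudaMalloc(&ptr, 1 << 20)); // aborts with the device id on failure
        CHECK_WITH_DEVICE(cudaFree(ptr));
    }
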
@@ -195,10 +231,13 @@ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__
 typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
-typedef void (*ggml_cuda_op_t)(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
-    float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
-    cudaStream_t & cudaStream_main);
+typedef void (*ggml_cuda_op_mul_mat_t)(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, const cudaStream_t & stream);
+typedef void (*ggml_cuda_op_flatten_t)(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream);
 
 // QK = number of values after dequantization
 // QR = QK / number of values before dequantization
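Note: the single `ggml_cuda_op_t` callback is split in two. `ggml_cuda_op_mul_mat_t` serves the row-partitioned matrix multiplications and now also receives the quantized `src1` data (`src1_ddq_i`) plus the padded row size, while `ggml_cuda_op_flatten_t` serves element-wise ops that see whole contiguous tensors. A hedged sketch of a function with the flatten shape; the op and its launch helper are hypothetical, only the signature is taken from the diff:

    // Hypothetical unary op matching ggml_cuda_op_flatten_t: it needs nothing
    // beyond whole-tensor device pointers and the stream to launch on.
    static void example_op_neg(
        const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
        const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
        GGML_ASSERT(src0->type == GGML_TYPE_F32);
        GGML_ASSERT( dst->type == GGML_TYPE_F32);
        // neg_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); // assumed launch helper
        (void) src1; (void) src1_dd; (void) dst; (void) dst_dd; (void) main_stream;
    }
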
@@ -379,11 +418,29 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif
 
+#define MUL_MAT_SRC1_COL_STRIDE 128
+
+#define MAX_STREAMS 8
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+
 struct ggml_tensor_extra_gpu {
     void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
 };
 
+// this is faster on Windows
+// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
+inline cudaError_t ggml_cuda_set_device(const int device) {
+    int current_device;
+    CUDA_CHECK(cudaGetDevice(&current_device));
+
+    if (device == current_device) {
+        return cudaSuccess;
+    }
+
+    return cudaSetDevice(device);
+}
+
 static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
@@ -396,8 +453,6 @@ static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
-static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
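Note: each device now owns a small pool of `MAX_STREAMS` non-blocking streams instead of one main stream, and `ggml_cuda_set_device` skips redundant `cudaSetDevice` calls. A minimal standalone sketch of such a pool; the round-robin pick is an assumption for illustration, not taken from this diff:

    #include <cstdint>
    #include <cuda_runtime.h>

    #define SKETCH_MAX_STREAMS 8

    static cudaStream_t sketch_streams[SKETCH_MAX_STREAMS];

    static void sketch_init_streams() {
        for (int i = 0; i < SKETCH_MAX_STREAMS; ++i) {
            cudaStreamCreateWithFlags(&sketch_streams[i], cudaStreamNonBlocking);
        }
    }

    // Independent chunks of work go to different streams so their copies and
    // kernels can overlap.
    static cudaStream_t sketch_stream_for(int64_t chunk) {
        return sketch_streams[chunk % SKETCH_MAX_STREAMS];
    }
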
@@ -447,58 +502,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     const float eps = 1e-5f;
 
-    float mean = 0.0f;
-    float var = 0.0f;
+    float2 mean_var = make_float2(0.f, 0.f);
 
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
-        mean += xi;
-        var += xi * xi;
+        mean_var.x += xi;
+        mean_var.y += xi * xi;
     }
 
     // sum up partial sums
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
-        var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+    mean_var = warp_reduce_sum(mean_var);
+    if (block_size > WARP_SIZE) {
+        __shared__ float2 s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        __syncthreads();
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var);
     }
 
-    mean /= ncols;
-    var = var / ncols - mean * mean;
-    const float inv_var = rsqrtf(var + eps);
+    const float mean = mean_var.x / ncols;
+    const float var = mean_var.y / ncols - mean * mean;
+    const float inv_std = rsqrtf(var + eps);
 
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+    }
+}
+
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
     }
+    return x;
 }
 
+template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
     float tmp = 0.0f; // partial sum for thread in warp
 
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
+    for (int col = tid; col < ncols; col += block_size) {
         const float xi = x[row*ncols + col];
         tmp += xi * xi;
     }
 
     // sum up partial sums
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
     }
 
     const float mean = tmp / ncols;
     const float scale = rsqrtf(mean + eps);
 
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
+    for (int col = tid; col < ncols; col += block_size) {
         dst[row*ncols + col] = scale * x[row*ncols + col];
     }
 }
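Note: the norm kernels now reduce in two levels: `warp_reduce_sum` folds the 32 lanes of a warp with `__shfl_xor_sync`, and in the 1024-thread configuration each warp parks its partial in shared memory before a second warp-level reduction. The same pattern isolated into a standalone kernel (a sketch; it assumes the block is either one warp or exactly 1024 threads, as the launchers further down guarantee):

    #include <cuda_runtime.h>

    #define WARP_SIZE 32

    static __device__ float warp_reduce_sum_sketch(float x) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, mask, 32);
        }
        return x;
    }

    template <int block_size>
    static __global__ void block_sum_sketch(const float * x, float * out, const int n) {
        float tmp = 0.0f;
        for (int i = threadIdx.x; i < n; i += block_size) {
            tmp += x[i];
        }
        tmp = warp_reduce_sum_sketch(tmp);              // level 1: within each warp
        if (block_size > WARP_SIZE) {
            __shared__ float s_sum[32];                 // one partial per warp (32 warps at 1024 threads)
            const int warp_id = threadIdx.x / WARP_SIZE;
            const int lane_id = threadIdx.x % WARP_SIZE;
            if (lane_id == 0) {
                s_sum[warp_id] = tmp;
            }
            __syncthreads();
            tmp = warp_reduce_sum_sketch(s_sum[lane_id]); // level 2: across the warps
        }
        if (threadIdx.x == 0) {
            *out = tmp;
        }
    }
    // launch, e.g.: block_sum_sketch<1024><<<1, 1024>>>(d_x, d_out, n);
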
@@ -3394,6 +3482,12 @@ static __device__ __forceinline__ void mul_mat_q(
     }
 }
 
+#define MMQ_X_Q4_0_RDNA2 64
+#define MMQ_Y_Q4_0_RDNA2 128
+#define NWARPS_Q4_0_RDNA2 8
+#define MMQ_X_Q4_0_RDNA1 64
+#define MMQ_Y_Q4_0_RDNA1 64
+#define NWARPS_Q4_0_RDNA1 8
 #define MMQ_X_Q4_0_AMPERE 64
 #define MMQ_Y_Q4_0_AMPERE 128
 #define NWARPS_Q4_0_AMPERE 4
@@ -3401,11 +3495,32 @@ static __device__ __forceinline__ void mul_mat_q(
 #define MMQ_Y_Q4_0_PASCAL 64
 #define NWARPS_Q4_0_PASCAL 8
 
-template <bool need_check> static __global__ void mul_mat_q4_0(
+template <bool need_check> static __global__ void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q4_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
 
-#if __CUDA_ARCH__ >= CC_TURING
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x = MMQ_X_Q4_0_RDNA2;
+    const int mmq_y = MMQ_Y_Q4_0_RDNA2;
+    const int nwarps = NWARPS_Q4_0_RDNA2;
+#else
+    const int mmq_x = MMQ_X_Q4_0_RDNA1;
+    const int mmq_y = MMQ_Y_Q4_0_RDNA1;
+    const int nwarps = NWARPS_Q4_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif __CUDA_ARCH__ >= CC_TURING
     const int mmq_x = MMQ_X_Q4_0_AMPERE;
     const int mmq_y = MMQ_Y_Q4_0_AMPERE;
     const int nwarps = NWARPS_Q4_0_AMPERE;
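Note: every quantization type gains RDNA1 and RDNA2 tile sizes (`MMQ_X`/`MMQ_Y`/`NWARPS`) alongside the existing Ampere and Pascal ones, and the kernel picks a set with the preprocessor, so the choice is resolved entirely at compile time per target architecture. The shape of that selection in miniature (constants here are made up purely for illustration):

    // Compile-time tile selection, mirroring the per-type blocks above.
    #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    #if defined(RDNA3) || defined(RDNA2)
    #define SKETCH_TILE_X 64    // RDNA2/3: wider tiles, with __launch_bounds__ capping register use
    #else
    #define SKETCH_TILE_X 32    // RDNA1: smaller tiles
    #endif
    #else
    #define SKETCH_TILE_X 128   // NVIDIA builds branch on __CUDA_ARCH__ instead
    #endif
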
@@ -3428,6 +3543,12 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
3428
3543
  #endif // __CUDA_ARCH__ >= CC_TURING
3429
3544
  }
3430
3545
 
3546
+ #define MMQ_X_Q4_1_RDNA2 64
3547
+ #define MMQ_Y_Q4_1_RDNA2 128
3548
+ #define NWARPS_Q4_1_RDNA2 8
3549
+ #define MMQ_X_Q4_1_RDNA1 64
3550
+ #define MMQ_Y_Q4_1_RDNA1 64
3551
+ #define NWARPS_Q4_1_RDNA1 8
3431
3552
  #define MMQ_X_Q4_1_AMPERE 64
3432
3553
  #define MMQ_Y_Q4_1_AMPERE 128
3433
3554
  #define NWARPS_Q4_1_AMPERE 4
@@ -3436,14 +3557,33 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
3436
3557
  #define NWARPS_Q4_1_PASCAL 8
3437
3558
 
3438
3559
  template <bool need_check> static __global__ void
3439
- #if __CUDA_ARCH__ < CC_TURING
3560
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3561
+ #if defined(RDNA3) || defined(RDNA2)
3562
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
3563
+ #endif // defined(RDNA3) || defined(RDNA2)
3564
+ #elif __CUDA_ARCH__ < CC_TURING
3440
3565
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
3441
3566
  #endif // __CUDA_ARCH__ < CC_TURING
3442
3567
  mul_mat_q4_1(
3443
3568
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3444
3569
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3445
3570
 
3446
- #if __CUDA_ARCH__ >= CC_TURING
3571
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3572
+ #if defined(RDNA3) || defined(RDNA2)
3573
+ const int mmq_x = MMQ_X_Q4_1_RDNA2;
3574
+ const int mmq_y = MMQ_Y_Q4_1_RDNA2;
3575
+ const int nwarps = NWARPS_Q4_1_RDNA2;
3576
+ #else
3577
+ const int mmq_x = MMQ_X_Q4_1_RDNA1;
3578
+ const int mmq_y = MMQ_Y_Q4_1_RDNA1;
3579
+ const int nwarps = NWARPS_Q4_1_RDNA1;
3580
+ #endif // defined(RDNA3) || defined(RDNA2)
3581
+
3582
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
3583
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3584
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3585
+
3586
+ #elif __CUDA_ARCH__ >= CC_TURING
3447
3587
  const int mmq_x = MMQ_X_Q4_1_AMPERE;
3448
3588
  const int mmq_y = MMQ_Y_Q4_1_AMPERE;
3449
3589
  const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3466,6 +3606,12 @@ template <bool need_check> static __global__ void
3466
3606
  #endif // __CUDA_ARCH__ >= CC_TURING
3467
3607
  }
3468
3608
 
3609
+ #define MMQ_X_Q5_0_RDNA2 64
3610
+ #define MMQ_Y_Q5_0_RDNA2 128
3611
+ #define NWARPS_Q5_0_RDNA2 8
3612
+ #define MMQ_X_Q5_0_RDNA1 64
3613
+ #define MMQ_Y_Q5_0_RDNA1 64
3614
+ #define NWARPS_Q5_0_RDNA1 8
3469
3615
  #define MMQ_X_Q5_0_AMPERE 128
3470
3616
  #define MMQ_Y_Q5_0_AMPERE 64
3471
3617
  #define NWARPS_Q5_0_AMPERE 4
@@ -3473,11 +3619,32 @@ template <bool need_check> static __global__ void
3473
3619
  #define MMQ_Y_Q5_0_PASCAL 64
3474
3620
  #define NWARPS_Q5_0_PASCAL 8
3475
3621
 
3476
- template <bool need_check> static __global__ void mul_mat_q5_0(
3622
+ template <bool need_check> static __global__ void
3623
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3624
+ #if defined(RDNA3) || defined(RDNA2)
3625
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
3626
+ #endif // defined(RDNA3) || defined(RDNA2)
3627
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3628
+ mul_mat_q5_0(
3477
3629
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3478
3630
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3479
3631
 
3480
- #if __CUDA_ARCH__ >= CC_TURING
3632
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3633
+ #if defined(RDNA3) || defined(RDNA2)
3634
+ const int mmq_x = MMQ_X_Q5_0_RDNA2;
3635
+ const int mmq_y = MMQ_Y_Q5_0_RDNA2;
3636
+ const int nwarps = NWARPS_Q5_0_RDNA2;
3637
+ #else
3638
+ const int mmq_x = MMQ_X_Q5_0_RDNA1;
3639
+ const int mmq_y = MMQ_Y_Q5_0_RDNA1;
3640
+ const int nwarps = NWARPS_Q5_0_RDNA1;
3641
+ #endif // defined(RDNA3) || defined(RDNA2)
3642
+
3643
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
3644
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3645
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3646
+
3647
+ #elif __CUDA_ARCH__ >= CC_TURING
3481
3648
  const int mmq_x = MMQ_X_Q5_0_AMPERE;
3482
3649
  const int mmq_y = MMQ_Y_Q5_0_AMPERE;
3483
3650
  const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3500,6 +3667,12 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
3500
3667
  #endif // __CUDA_ARCH__ >= CC_TURING
3501
3668
  }
3502
3669
 
3670
+ #define MMQ_X_Q5_1_RDNA2 64
3671
+ #define MMQ_Y_Q5_1_RDNA2 128
3672
+ #define NWARPS_Q5_1_RDNA2 8
3673
+ #define MMQ_X_Q5_1_RDNA1 64
3674
+ #define MMQ_Y_Q5_1_RDNA1 64
3675
+ #define NWARPS_Q5_1_RDNA1 8
3503
3676
  #define MMQ_X_Q5_1_AMPERE 128
3504
3677
  #define MMQ_Y_Q5_1_AMPERE 64
3505
3678
  #define NWARPS_Q5_1_AMPERE 4
@@ -3507,11 +3680,32 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
3507
3680
  #define MMQ_Y_Q5_1_PASCAL 64
3508
3681
  #define NWARPS_Q5_1_PASCAL 8
3509
3682
 
3510
- template <bool need_check> static __global__ void mul_mat_q5_1(
3683
+ template <bool need_check> static __global__ void
3684
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3685
+ #if defined(RDNA3) || defined(RDNA2)
3686
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
3687
+ #endif // defined(RDNA3) || defined(RDNA2)
3688
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3689
+ mul_mat_q5_1(
3511
3690
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3512
3691
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3513
3692
 
3514
- #if __CUDA_ARCH__ >= CC_TURING
3693
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3694
+ #if defined(RDNA3) || defined(RDNA2)
3695
+ const int mmq_x = MMQ_X_Q5_1_RDNA2;
3696
+ const int mmq_y = MMQ_Y_Q5_1_RDNA2;
3697
+ const int nwarps = NWARPS_Q5_1_RDNA2;
3698
+ #else
3699
+ const int mmq_x = MMQ_X_Q5_1_RDNA1;
3700
+ const int mmq_y = MMQ_Y_Q5_1_RDNA1;
3701
+ const int nwarps = NWARPS_Q5_1_RDNA1;
3702
+ #endif // defined(RDNA3) || defined(RDNA2)
3703
+
3704
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
3705
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3706
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3707
+
3708
+ #elif __CUDA_ARCH__ >= CC_TURING
3515
3709
  const int mmq_x = MMQ_X_Q5_1_AMPERE;
3516
3710
  const int mmq_y = MMQ_Y_Q5_1_AMPERE;
3517
3711
  const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3534,6 +3728,12 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
3534
3728
  #endif // __CUDA_ARCH__ >= CC_TURING
3535
3729
  }
3536
3730
 
3731
+ #define MMQ_X_Q8_0_RDNA2 64
3732
+ #define MMQ_Y_Q8_0_RDNA2 128
3733
+ #define NWARPS_Q8_0_RDNA2 8
3734
+ #define MMQ_X_Q8_0_RDNA1 64
3735
+ #define MMQ_Y_Q8_0_RDNA1 64
3736
+ #define NWARPS_Q8_0_RDNA1 8
3537
3737
  #define MMQ_X_Q8_0_AMPERE 128
3538
3738
  #define MMQ_Y_Q8_0_AMPERE 64
3539
3739
  #define NWARPS_Q8_0_AMPERE 4
@@ -3541,11 +3741,32 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
3541
3741
  #define MMQ_Y_Q8_0_PASCAL 64
3542
3742
  #define NWARPS_Q8_0_PASCAL 8
3543
3743
 
3544
- template <bool need_check> static __global__ void mul_mat_q8_0(
3744
+ template <bool need_check> static __global__ void
3745
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3746
+ #if defined(RDNA3) || defined(RDNA2)
3747
+ __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
3748
+ #endif // defined(RDNA3) || defined(RDNA2)
3749
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3750
+ mul_mat_q8_0(
3545
3751
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3546
3752
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3547
3753
 
3548
- #if __CUDA_ARCH__ >= CC_TURING
3754
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3755
+ #if defined(RDNA3) || defined(RDNA2)
3756
+ const int mmq_x = MMQ_X_Q8_0_RDNA2;
3757
+ const int mmq_y = MMQ_Y_Q8_0_RDNA2;
3758
+ const int nwarps = NWARPS_Q8_0_RDNA2;
3759
+ #else
3760
+ const int mmq_x = MMQ_X_Q8_0_RDNA1;
3761
+ const int mmq_y = MMQ_Y_Q8_0_RDNA1;
3762
+ const int nwarps = NWARPS_Q8_0_RDNA1;
3763
+ #endif // defined(RDNA3) || defined(RDNA2)
3764
+
3765
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
3766
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3767
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3768
+
3769
+ #elif __CUDA_ARCH__ >= CC_TURING
3549
3770
  const int mmq_x = MMQ_X_Q8_0_AMPERE;
3550
3771
  const int mmq_y = MMQ_Y_Q8_0_AMPERE;
3551
3772
  const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3568,6 +3789,12 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
3568
3789
  #endif // __CUDA_ARCH__ >= CC_TURING
3569
3790
  }
3570
3791
 
3792
+ #define MMQ_X_Q2_K_RDNA2 64
3793
+ #define MMQ_Y_Q2_K_RDNA2 128
3794
+ #define NWARPS_Q2_K_RDNA2 8
3795
+ #define MMQ_X_Q2_K_RDNA1 128
3796
+ #define MMQ_Y_Q2_K_RDNA1 32
3797
+ #define NWARPS_Q2_K_RDNA1 8
3571
3798
  #define MMQ_X_Q2_K_AMPERE 64
3572
3799
  #define MMQ_Y_Q2_K_AMPERE 128
3573
3800
  #define NWARPS_Q2_K_AMPERE 4
@@ -3575,11 +3802,32 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
3575
3802
  #define MMQ_Y_Q2_K_PASCAL 64
3576
3803
  #define NWARPS_Q2_K_PASCAL 8
3577
3804
 
3578
- template <bool need_check> static __global__ void mul_mat_q2_K(
3805
+ template <bool need_check> static __global__ void
3806
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3807
+ #if defined(RDNA3) || defined(RDNA2)
3808
+ __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
3809
+ #endif // defined(RDNA3) || defined(RDNA2)
3810
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3811
+ mul_mat_q2_K(
3579
3812
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3580
3813
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3581
3814
 
3582
- #if __CUDA_ARCH__ >= CC_TURING
3815
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3816
+ #if defined(RDNA3) || defined(RDNA2)
3817
+ const int mmq_x = MMQ_X_Q2_K_RDNA2;
3818
+ const int mmq_y = MMQ_Y_Q2_K_RDNA2;
3819
+ const int nwarps = NWARPS_Q2_K_RDNA2;
3820
+ #else
3821
+ const int mmq_x = MMQ_X_Q2_K_RDNA1;
3822
+ const int mmq_y = MMQ_Y_Q2_K_RDNA1;
3823
+ const int nwarps = NWARPS_Q2_K_RDNA1;
3824
+ #endif // defined(RDNA3) || defined(RDNA2)
3825
+
3826
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
3827
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3828
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3829
+
3830
+ #elif __CUDA_ARCH__ >= CC_TURING
3583
3831
  const int mmq_x = MMQ_X_Q2_K_AMPERE;
3584
3832
  const int mmq_y = MMQ_Y_Q2_K_AMPERE;
3585
3833
  const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3602,6 +3850,12 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
3602
3850
  #endif // __CUDA_ARCH__ >= CC_TURING
3603
3851
  }
3604
3852
 
3853
+ #define MMQ_X_Q3_K_RDNA2 128
3854
+ #define MMQ_Y_Q3_K_RDNA2 64
3855
+ #define NWARPS_Q3_K_RDNA2 8
3856
+ #define MMQ_X_Q3_K_RDNA1 32
3857
+ #define MMQ_Y_Q3_K_RDNA1 128
3858
+ #define NWARPS_Q3_K_RDNA1 8
3605
3859
  #define MMQ_X_Q3_K_AMPERE 128
3606
3860
  #define MMQ_Y_Q3_K_AMPERE 128
3607
3861
  #define NWARPS_Q3_K_AMPERE 4
@@ -3610,14 +3864,33 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
3610
3864
  #define NWARPS_Q3_K_PASCAL 8
3611
3865
 
3612
3866
  template <bool need_check> static __global__ void
3613
- #if __CUDA_ARCH__ < CC_TURING
3867
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3868
+ #if defined(RDNA3) || defined(RDNA2)
3869
+ __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
3870
+ #endif // defined(RDNA3) || defined(RDNA2)
3871
+ #elif __CUDA_ARCH__ < CC_TURING
3614
3872
  __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
3615
3873
  #endif // __CUDA_ARCH__ < CC_TURING
3616
3874
  mul_mat_q3_K(
3617
3875
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3618
3876
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3619
3877
 
3620
- #if __CUDA_ARCH__ >= CC_TURING
3878
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3879
+ #if defined(RDNA3) || defined(RDNA2)
3880
+ const int mmq_x = MMQ_X_Q3_K_RDNA2;
3881
+ const int mmq_y = MMQ_Y_Q3_K_RDNA2;
3882
+ const int nwarps = NWARPS_Q3_K_RDNA2;
3883
+ #else
3884
+ const int mmq_x = MMQ_X_Q3_K_RDNA1;
3885
+ const int mmq_y = MMQ_Y_Q3_K_RDNA1;
3886
+ const int nwarps = NWARPS_Q3_K_RDNA1;
3887
+ #endif // defined(RDNA3) || defined(RDNA2)
3888
+
3889
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
3890
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3891
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3892
+
3893
+ #elif __CUDA_ARCH__ >= CC_TURING
3621
3894
  const int mmq_x = MMQ_X_Q3_K_AMPERE;
3622
3895
  const int mmq_y = MMQ_Y_Q3_K_AMPERE;
3623
3896
  const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3640,6 +3913,12 @@ template <bool need_check> static __global__ void
3640
3913
  #endif // __CUDA_ARCH__ >= CC_TURING
3641
3914
  }
3642
3915
 
3916
+ #define MMQ_X_Q4_K_RDNA2 64
3917
+ #define MMQ_Y_Q4_K_RDNA2 128
3918
+ #define NWARPS_Q4_K_RDNA2 8
3919
+ #define MMQ_X_Q4_K_RDNA1 32
3920
+ #define MMQ_Y_Q4_K_RDNA1 64
3921
+ #define NWARPS_Q4_K_RDNA1 8
3643
3922
  #define MMQ_X_Q4_K_AMPERE 64
3644
3923
  #define MMQ_Y_Q4_K_AMPERE 128
3645
3924
  #define NWARPS_Q4_K_AMPERE 4
@@ -3648,14 +3927,33 @@ template <bool need_check> static __global__ void
3648
3927
  #define NWARPS_Q4_K_PASCAL 8
3649
3928
 
3650
3929
  template <bool need_check> static __global__ void
3651
- #if __CUDA_ARCH__ < CC_TURING
3930
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3931
+ #if defined(RDNA3) || defined(RDNA2)
3932
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
3933
+ #endif // defined(RDNA3) || defined(RDNA2)
3934
+ #elif __CUDA_ARCH__ < CC_TURING
3652
3935
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
3653
3936
  #endif // __CUDA_ARCH__ < CC_TURING
3654
3937
  mul_mat_q4_K(
3655
3938
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3656
3939
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3657
3940
 
3658
- #if __CUDA_ARCH__ >= CC_TURING
3941
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3942
+ #if defined(RDNA3) || defined(RDNA2)
3943
+ const int mmq_x = MMQ_X_Q4_K_RDNA2;
3944
+ const int mmq_y = MMQ_Y_Q4_K_RDNA2;
3945
+ const int nwarps = NWARPS_Q4_K_RDNA2;
3946
+ #else
3947
+ const int mmq_x = MMQ_X_Q4_K_RDNA1;
3948
+ const int mmq_y = MMQ_Y_Q4_K_RDNA1;
3949
+ const int nwarps = NWARPS_Q4_K_RDNA1;
3950
+ #endif // defined(RDNA3) || defined(RDNA2)
3951
+
3952
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
3953
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3954
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3955
+
3956
+ #elif __CUDA_ARCH__ >= CC_TURING
3659
3957
  const int mmq_x = MMQ_X_Q4_K_AMPERE;
3660
3958
  const int mmq_y = MMQ_Y_Q4_K_AMPERE;
3661
3959
  const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -3678,6 +3976,12 @@ template <bool need_check> static __global__ void
3678
3976
  #endif // __CUDA_ARCH__ >= CC_TURING
3679
3977
  }
3680
3978
 
3979
+ #define MMQ_X_Q5_K_RDNA2 64
3980
+ #define MMQ_Y_Q5_K_RDNA2 128
3981
+ #define NWARPS_Q5_K_RDNA2 8
3982
+ #define MMQ_X_Q5_K_RDNA1 32
3983
+ #define MMQ_Y_Q5_K_RDNA1 64
3984
+ #define NWARPS_Q5_K_RDNA1 8
3681
3985
  #define MMQ_X_Q5_K_AMPERE 64
3682
3986
  #define MMQ_Y_Q5_K_AMPERE 128
3683
3987
  #define NWARPS_Q5_K_AMPERE 4
@@ -3685,11 +3989,32 @@ template <bool need_check> static __global__ void
3685
3989
  #define MMQ_Y_Q5_K_PASCAL 64
3686
3990
  #define NWARPS_Q5_K_PASCAL 8
3687
3991
 
3688
- template <bool need_check> static __global__ void mul_mat_q5_K(
3992
+ template <bool need_check> static __global__ void
3993
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3994
+ #if defined(RDNA3) || defined(RDNA2)
3995
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
3996
+ #endif // defined(RDNA3) || defined(RDNA2)
3997
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3998
+ mul_mat_q5_K(
3689
3999
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3690
4000
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3691
4001
 
3692
- #if __CUDA_ARCH__ >= CC_TURING
4002
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4003
+ #if defined(RDNA3) || defined(RDNA2)
4004
+ const int mmq_x = MMQ_X_Q5_K_RDNA2;
4005
+ const int mmq_y = MMQ_Y_Q5_K_RDNA2;
4006
+ const int nwarps = NWARPS_Q5_K_RDNA2;
4007
+ #else
4008
+ const int mmq_x = MMQ_X_Q5_K_RDNA1;
4009
+ const int mmq_y = MMQ_Y_Q5_K_RDNA1;
4010
+ const int nwarps = NWARPS_Q5_K_RDNA1;
4011
+ #endif // defined(RDNA3) || defined(RDNA2)
4012
+
4013
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4014
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4015
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4016
+
4017
+ #elif __CUDA_ARCH__ >= CC_TURING
3693
4018
  const int mmq_x = MMQ_X_Q5_K_AMPERE;
3694
4019
  const int mmq_y = MMQ_Y_Q5_K_AMPERE;
3695
4020
  const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -3712,6 +4037,12 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
3712
4037
  #endif // __CUDA_ARCH__ >= CC_TURING
3713
4038
  }
3714
4039
 
4040
+ #define MMQ_X_Q6_K_RDNA2 64
4041
+ #define MMQ_Y_Q6_K_RDNA2 128
4042
+ #define NWARPS_Q6_K_RDNA2 8
4043
+ #define MMQ_X_Q6_K_RDNA1 32
4044
+ #define MMQ_Y_Q6_K_RDNA1 64
4045
+ #define NWARPS_Q6_K_RDNA1 8
3715
4046
  #define MMQ_X_Q6_K_AMPERE 64
3716
4047
  #define MMQ_Y_Q6_K_AMPERE 64
3717
4048
  #define NWARPS_Q6_K_AMPERE 4
@@ -3720,14 +4051,33 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
3720
4051
  #define NWARPS_Q6_K_PASCAL 8
3721
4052
 
3722
4053
  template <bool need_check> static __global__ void
3723
- #if __CUDA_ARCH__ < CC_TURING
4054
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4055
+ #if defined(RDNA3) || defined(RDNA2)
4056
+ __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
4057
+ #endif // defined(RDNA3) || defined(RDNA2)
4058
+ #elif __CUDA_ARCH__ < CC_TURING
3724
4059
  __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
3725
4060
  #endif // __CUDA_ARCH__ < CC_TURING
3726
4061
  mul_mat_q6_K(
3727
4062
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3728
4063
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3729
4064
 
3730
- #if __CUDA_ARCH__ >= CC_TURING
4065
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4066
+ #if defined(RDNA3) || defined(RDNA2)
4067
+ const int mmq_x = MMQ_X_Q6_K_RDNA2;
4068
+ const int mmq_y = MMQ_Y_Q6_K_RDNA2;
4069
+ const int nwarps = NWARPS_Q6_K_RDNA2;
4070
+ #else
4071
+ const int mmq_x = MMQ_X_Q6_K_RDNA1;
4072
+ const int mmq_y = MMQ_Y_Q6_K_RDNA1;
4073
+ const int nwarps = NWARPS_Q6_K_RDNA1;
4074
+ #endif // defined(RDNA3) || defined(RDNA2)
4075
+
4076
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4077
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4078
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4079
+
4080
+ #elif __CUDA_ARCH__ >= CC_TURING
3731
4081
  const int mmq_x = MMQ_X_Q6_K_AMPERE;
3732
4082
  const int mmq_y = MMQ_Y_Q6_K_AMPERE;
3733
4083
  const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4036,7 +4386,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
4036
4386
  dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
4037
4387
  }
4038
4388
 
4039
- static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
4389
+ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
4390
+ const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
4040
4391
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
4041
4392
  const int half_n_dims = ncols/4;
4042
4393
 
@@ -4048,8 +4399,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
4048
4399
  const int i = row*ncols + col;
4049
4400
 
4050
4401
  const float col_theta_scale = powf(theta_scale, col);
4402
+ const float p = p0 + p_delta*(row/p_delta_rows);
4051
4403
 
4052
- const float theta = p*col_theta_scale;
4404
+ const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
4053
4405
  const float sin_theta = sinf(theta);
4054
4406
  const float cos_theta = cosf(theta);
4055
4407
 
@@ -4059,7 +4411,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
4059
4411
  dst[i + 0] = x0*cos_theta - x1*sin_theta;
4060
4412
  dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
4061
4413
 
4062
- const float block_theta = block_p*col_theta_scale;
4414
+ const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
4063
4415
  const float sin_block_theta = sinf(block_theta);
4064
4416
  const float cos_block_theta = cosf(block_theta);
4065
4417
 
@@ -4186,14 +4538,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
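Note: `rope_glm_f32` now rebuilds the position as `p = p0 + p_delta*(row/p_delta_rows)` and splits it at `p_delta*(n_ctx - 2)`: `theta` is the position clamped to that boundary and `block_theta` carries whatever lies beyond it. A tiny host-side table of the split for the first rotary dimension (where `col_theta_scale` is 1), standalone and only for intuition:

    #include <algorithm>
    #include <cstdio>

    int main() {
        const float p_delta = 1.0f; // position step per row
        const int   n_ctx   = 8;    // context length
        for (float p = 0.0f; p < 10.0f; p += 1.0f) {
            const float theta       = std::min(p, p_delta*(n_ctx - 2));        // clamped part
            const float block_theta = std::max(p - p_delta*(n_ctx - 2), 0.0f); // remainder
            printf("p=%4.1f theta=%4.1f block_theta=%4.1f\n", p, theta, block_theta);
        }
    }
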
4186
4538
 
4187
4539
  static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
4188
4540
  GGML_ASSERT(ncols % WARP_SIZE == 0);
4189
- const dim3 block_dims(WARP_SIZE, 1, 1);
4190
- norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
4541
+ if (ncols < 1024) {
4542
+ const dim3 block_dims(WARP_SIZE, 1, 1);
4543
+ norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
4544
+ } else {
4545
+ const dim3 block_dims(1024, 1, 1);
4546
+ norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
4547
+ }
4191
4548
  }
4192
4549
 
4193
4550
  static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
4194
4551
  GGML_ASSERT(ncols % WARP_SIZE == 0);
4195
- const dim3 block_dims(WARP_SIZE, 1, 1);
4196
- rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
4552
+ if (ncols < 1024) {
4553
+ const dim3 block_dims(WARP_SIZE, 1, 1);
4554
+ rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
4555
+ } else {
4556
+ const dim3 block_dims(1024, 1, 1);
4557
+ rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
4558
+ }
4197
4559
  }
4198
4560
 
4199
4561
  static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
@@ -4498,7 +4860,15 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
4498
4860
  const int compute_capability = g_compute_capabilities[id];
4499
4861
 
4500
4862
  int mmq_x, mmq_y, nwarps;
4501
- if (compute_capability >= CC_TURING) {
4863
+ if (compute_capability >= CC_RDNA2) {
4864
+ mmq_x = MMQ_X_Q4_0_RDNA2;
4865
+ mmq_y = MMQ_Y_Q4_0_RDNA2;
4866
+ nwarps = NWARPS_Q4_0_RDNA2;
4867
+ } else if (compute_capability >= CC_OFFSET_AMD) {
4868
+ mmq_x = MMQ_X_Q4_0_RDNA1;
4869
+ mmq_y = MMQ_Y_Q4_0_RDNA1;
4870
+ nwarps = NWARPS_Q4_0_RDNA1;
4871
+ } else if (compute_capability >= CC_TURING) {
4502
4872
  mmq_x = MMQ_X_Q4_0_AMPERE;
4503
4873
  mmq_y = MMQ_Y_Q4_0_AMPERE;
4504
4874
  nwarps = NWARPS_Q4_0_AMPERE;
@@ -4535,7 +4905,15 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
4535
4905
  const int compute_capability = g_compute_capabilities[id];
4536
4906
 
4537
4907
  int mmq_x, mmq_y, nwarps;
4538
- if (compute_capability >= CC_TURING) {
4908
+ if (compute_capability >= CC_RDNA2) {
4909
+ mmq_x = MMQ_X_Q4_1_RDNA2;
4910
+ mmq_y = MMQ_Y_Q4_1_RDNA2;
4911
+ nwarps = NWARPS_Q4_1_RDNA2;
4912
+ } else if (compute_capability >= CC_OFFSET_AMD) {
4913
+ mmq_x = MMQ_X_Q4_1_RDNA1;
4914
+ mmq_y = MMQ_Y_Q4_1_RDNA1;
4915
+ nwarps = NWARPS_Q4_1_RDNA1;
4916
+ } else if (compute_capability >= CC_TURING) {
4539
4917
  mmq_x = MMQ_X_Q4_1_AMPERE;
4540
4918
  mmq_y = MMQ_Y_Q4_1_AMPERE;
4541
4919
  nwarps = NWARPS_Q4_1_AMPERE;
@@ -4572,7 +4950,15 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
4572
4950
  const int compute_capability = g_compute_capabilities[id];
4573
4951
 
4574
4952
  int mmq_x, mmq_y, nwarps;
4575
- if (compute_capability >= CC_TURING) {
4953
+ if (compute_capability >= CC_RDNA2) {
4954
+ mmq_x = MMQ_X_Q5_0_RDNA2;
4955
+ mmq_y = MMQ_Y_Q5_0_RDNA2;
4956
+ nwarps = NWARPS_Q5_0_RDNA2;
4957
+ } else if (compute_capability >= CC_OFFSET_AMD) {
4958
+ mmq_x = MMQ_X_Q5_0_RDNA1;
4959
+ mmq_y = MMQ_Y_Q5_0_RDNA1;
4960
+ nwarps = NWARPS_Q5_0_RDNA1;
4961
+ } else if (compute_capability >= CC_TURING) {
4576
4962
  mmq_x = MMQ_X_Q5_0_AMPERE;
4577
4963
  mmq_y = MMQ_Y_Q5_0_AMPERE;
4578
4964
  nwarps = NWARPS_Q5_0_AMPERE;
@@ -4609,7 +4995,15 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
4609
4995
  const int compute_capability = g_compute_capabilities[id];
4610
4996
 
4611
4997
  int mmq_x, mmq_y, nwarps;
4612
- if (compute_capability >= CC_TURING) {
4998
+ if (compute_capability >= CC_RDNA2) {
4999
+ mmq_x = MMQ_X_Q5_1_RDNA2;
5000
+ mmq_y = MMQ_Y_Q5_1_RDNA2;
5001
+ nwarps = NWARPS_Q5_1_RDNA2;
5002
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5003
+ mmq_x = MMQ_X_Q5_1_RDNA1;
5004
+ mmq_y = MMQ_Y_Q5_1_RDNA1;
5005
+ nwarps = NWARPS_Q5_1_RDNA1;
5006
+ } else if (compute_capability >= CC_TURING) {
4613
5007
  mmq_x = MMQ_X_Q5_1_AMPERE;
4614
5008
  mmq_y = MMQ_Y_Q5_1_AMPERE;
4615
5009
  nwarps = NWARPS_Q5_1_AMPERE;
@@ -4646,7 +5040,15 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
4646
5040
  const int compute_capability = g_compute_capabilities[id];
4647
5041
 
4648
5042
  int mmq_x, mmq_y, nwarps;
4649
- if (compute_capability >= CC_TURING) {
5043
+ if (compute_capability >= CC_RDNA2) {
5044
+ mmq_x = MMQ_X_Q8_0_RDNA2;
5045
+ mmq_y = MMQ_Y_Q8_0_RDNA2;
5046
+ nwarps = NWARPS_Q8_0_RDNA2;
5047
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5048
+ mmq_x = MMQ_X_Q8_0_RDNA1;
5049
+ mmq_y = MMQ_Y_Q8_0_RDNA1;
5050
+ nwarps = NWARPS_Q8_0_RDNA1;
5051
+ } else if (compute_capability >= CC_TURING) {
4650
5052
  mmq_x = MMQ_X_Q8_0_AMPERE;
4651
5053
  mmq_y = MMQ_Y_Q8_0_AMPERE;
4652
5054
  nwarps = NWARPS_Q8_0_AMPERE;
@@ -4683,7 +5085,15 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
4683
5085
  const int compute_capability = g_compute_capabilities[id];
4684
5086
 
4685
5087
  int mmq_x, mmq_y, nwarps;
4686
- if (compute_capability >= CC_TURING) {
5088
+ if (compute_capability >= CC_RDNA2) {
5089
+ mmq_x = MMQ_X_Q2_K_RDNA2;
5090
+ mmq_y = MMQ_Y_Q2_K_RDNA2;
5091
+ nwarps = NWARPS_Q2_K_RDNA2;
5092
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5093
+ mmq_x = MMQ_X_Q2_K_RDNA1;
5094
+ mmq_y = MMQ_Y_Q2_K_RDNA1;
5095
+ nwarps = NWARPS_Q2_K_RDNA1;
5096
+ } else if (compute_capability >= CC_TURING) {
4687
5097
  mmq_x = MMQ_X_Q2_K_AMPERE;
4688
5098
  mmq_y = MMQ_Y_Q2_K_AMPERE;
4689
5099
  nwarps = NWARPS_Q2_K_AMPERE;
@@ -4722,7 +5132,15 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
4722
5132
  const int compute_capability = g_compute_capabilities[id];
4723
5133
 
4724
5134
  int mmq_x, mmq_y, nwarps;
4725
- if (compute_capability >= CC_TURING) {
5135
+ if (compute_capability >= CC_RDNA2) {
5136
+ mmq_x = MMQ_X_Q3_K_RDNA2;
5137
+ mmq_y = MMQ_Y_Q3_K_RDNA2;
5138
+ nwarps = NWARPS_Q3_K_RDNA2;
5139
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5140
+ mmq_x = MMQ_X_Q3_K_RDNA1;
5141
+ mmq_y = MMQ_Y_Q3_K_RDNA1;
5142
+ nwarps = NWARPS_Q3_K_RDNA1;
5143
+ } else if (compute_capability >= CC_TURING) {
4726
5144
  mmq_x = MMQ_X_Q3_K_AMPERE;
4727
5145
  mmq_y = MMQ_Y_Q3_K_AMPERE;
4728
5146
  nwarps = NWARPS_Q3_K_AMPERE;
@@ -4760,7 +5178,15 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
4760
5178
  const int compute_capability = g_compute_capabilities[id];
4761
5179
 
4762
5180
  int mmq_x, mmq_y, nwarps;
4763
- if (compute_capability >= CC_TURING) {
5181
+ if (compute_capability >= CC_RDNA2) {
5182
+ mmq_x = MMQ_X_Q4_K_RDNA2;
5183
+ mmq_y = MMQ_Y_Q4_K_RDNA2;
5184
+ nwarps = NWARPS_Q4_K_RDNA2;
5185
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5186
+ mmq_x = MMQ_X_Q4_K_RDNA1;
5187
+ mmq_y = MMQ_Y_Q4_K_RDNA1;
5188
+ nwarps = NWARPS_Q4_K_RDNA1;
5189
+ } else if (compute_capability >= CC_TURING) {
4764
5190
  mmq_x = MMQ_X_Q4_K_AMPERE;
4765
5191
  mmq_y = MMQ_Y_Q4_K_AMPERE;
4766
5192
  nwarps = NWARPS_Q4_K_AMPERE;
@@ -4797,7 +5223,15 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
4797
5223
  const int compute_capability = g_compute_capabilities[id];
4798
5224
 
4799
5225
  int mmq_x, mmq_y, nwarps;
4800
- if (compute_capability >= CC_TURING) {
5226
+ if (compute_capability >= CC_RDNA2) {
5227
+ mmq_x = MMQ_X_Q5_K_RDNA2;
5228
+ mmq_y = MMQ_Y_Q5_K_RDNA2;
5229
+ nwarps = NWARPS_Q5_K_RDNA2;
5230
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5231
+ mmq_x = MMQ_X_Q5_K_RDNA1;
5232
+ mmq_y = MMQ_Y_Q5_K_RDNA1;
5233
+ nwarps = NWARPS_Q5_K_RDNA1;
5234
+ } else if (compute_capability >= CC_TURING) {
4801
5235
  mmq_x = MMQ_X_Q5_K_AMPERE;
4802
5236
  mmq_y = MMQ_Y_Q5_K_AMPERE;
4803
5237
  nwarps = NWARPS_Q5_K_AMPERE;
@@ -4834,7 +5268,15 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
4834
5268
  const int compute_capability = g_compute_capabilities[id];
4835
5269
 
4836
5270
  int mmq_x, mmq_y, nwarps;
4837
- if (compute_capability >= CC_TURING) {
5271
+ if (compute_capability >= CC_RDNA2) {
5272
+ mmq_x = MMQ_X_Q6_K_RDNA2;
5273
+ mmq_y = MMQ_Y_Q6_K_RDNA2;
5274
+ nwarps = NWARPS_Q6_K_RDNA2;
5275
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5276
+ mmq_x = MMQ_X_Q6_K_RDNA1;
5277
+ mmq_y = MMQ_Y_Q6_K_RDNA1;
5278
+ nwarps = NWARPS_Q6_K_RDNA1;
5279
+ } else if (compute_capability >= CC_TURING) {
4838
5280
  mmq_x = MMQ_X_Q6_K_AMPERE;
4839
5281
  mmq_y = MMQ_Y_Q6_K_AMPERE;
4840
5282
  nwarps = NWARPS_Q6_K_AMPERE;
@@ -4924,12 +5366,13 @@ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, co
4924
5366
  rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
4925
5367
  }
4926
5368
 
4927
- static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
4928
- GGML_ASSERT(nrows % 4 == 0);
4929
- const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
4930
- const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
5369
+ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
5370
+ const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
5371
+ GGML_ASSERT(ncols % 4 == 0);
5372
+ const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
5373
+ const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
4931
5374
  const dim3 block_nums(num_blocks_x, nrows, 1);
4932
- rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
5375
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
4933
5376
  }
4934
5377
 
4935
5378
  static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5067,25 +5510,30 @@ void ggml_init_cublas() {
5067
5510
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
5068
5511
  int64_t total_vram = 0;
5069
5512
  fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
5070
- for (int id = 0; id < g_device_count; ++id) {
5513
+ for (int64_t id = 0; id < g_device_count; ++id) {
5071
5514
  cudaDeviceProp prop;
5072
5515
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
5073
- fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
5516
+ fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
5074
5517
 
5075
5518
  g_tensor_split[id] = total_vram;
5076
5519
  total_vram += prop.totalGlobalMem;
5077
-
5520
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
5521
+ g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
5522
+ #else
5078
5523
  g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
5524
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
5079
5525
  }
5080
- for (int id = 0; id < g_device_count; ++id) {
5526
+ for (int64_t id = 0; id < g_device_count; ++id) {
5081
5527
  g_tensor_split[id] /= total_vram;
5082
5528
  }
5083
5529
 
5084
- for (int id = 0; id < g_device_count; ++id) {
5085
- CUDA_CHECK(cudaSetDevice(id));
5530
+ for (int64_t id = 0; id < g_device_count; ++id) {
5531
+ CUDA_CHECK(ggml_cuda_set_device(id));
5086
5532
 
5087
- // create main stream
5088
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
5533
+ // create cuda streams
5534
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
5535
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
5536
+ }
5089
5537
 
5090
5538
  // create cublas handle
5091
5539
  CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
@@ -5154,7 +5602,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5154
5602
  if (src->backend == GGML_BACKEND_CPU) {
5155
5603
  kind = cudaMemcpyHostToDevice;
5156
5604
  src_ptr = (char *) src->data;
5157
- } else if (src->backend == GGML_BACKEND_GPU) {
5605
+ } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
5606
+ GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
5158
5607
  kind = cudaMemcpyDeviceToDevice;
5159
5608
  struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
5160
5609
  int id;
@@ -5193,236 +5642,205 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5193
5642
  }
5194
5643
 
5195
5644
  inline void ggml_cuda_op_add(
5196
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5197
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5198
- cudaStream_t & cudaStream_main){
5199
-
5200
- GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
5201
- GGML_ASSERT(src1_ddf_i != nullptr);
5202
- GGML_ASSERT(dst_ddf_i != nullptr);
5645
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5646
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5203
5647
 
5204
- const int64_t ne00 = src0->ne[0];
5205
- const int64_t i01_diff = i01_high - i01_low;
5648
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
5206
5649
 
5207
5650
  const int64_t ne10 = src1->ne[0];
5208
5651
  const int64_t ne11 = src1->ne[1];
5209
5652
 
5210
- // compute
5211
5653
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
5212
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
5654
+ add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
5213
5655
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
5214
- add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
5656
+ add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
5215
5657
  } else {
5216
5658
  GGML_ASSERT(false);
5217
5659
  }
5218
5660
 
5219
5661
  (void) src1;
5220
5662
  (void) dst;
5221
- (void) src0_ddq_i;
5222
- (void) i02;
5223
- (void) i1;
5224
5663
  }
5225
5664
 
5226
5665
  inline void ggml_cuda_op_mul(
5227
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5228
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5229
- cudaStream_t & cudaStream_main){
5230
-
5231
- GGML_ASSERT(src0_ddf_i != nullptr);
5232
- GGML_ASSERT(src1_ddf_i != nullptr);
5233
- GGML_ASSERT(dst_ddf_i != nullptr);
5666
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5667
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5234
5668
 
5235
- const int64_t ne00 = src0->ne[0];
5236
- const int64_t i01_diff = i01_high - i01_low;
5669
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5670
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
5671
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5237
5672
 
5238
5673
  const int64_t ne10 = src1->ne[0];
5239
5674
  const int64_t ne11 = src1->ne[1];
5240
5675
 
5241
- mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
5676
+ mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
5242
5677
 
5243
5678
  (void) dst;
5244
- (void) src0_ddq_i;
5245
- (void) i02;
5246
- (void) i1;
5247
5679
  }
5248
5680
 
5249
5681
  inline void ggml_cuda_op_gelu(
5250
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5251
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5252
- cudaStream_t & cudaStream_main){
5253
-
5254
- GGML_ASSERT(src0_ddf_i != nullptr);
5255
- GGML_ASSERT(dst_ddf_i != nullptr);
5682
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5683
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5256
5684
 
5257
- const int64_t ne00 = src0->ne[0];
5258
- const int64_t i01_diff = i01_high - i01_low;
5685
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5686
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5259
5687
 
5260
- // compute
5261
- gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
5688
+ gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
5262
5689
 
5263
5690
  (void) src1;
5264
5691
  (void) dst;
5265
- (void) src0_ddq_i;
5266
- (void) src1_ddf_i;
5267
- (void) i02;
5268
- (void) i1;
5692
+ (void) src1_dd;
5269
5693
  }
5270
5694
 
5271
5695
  inline void ggml_cuda_op_silu(
5272
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5273
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5274
- cudaStream_t & cudaStream_main){
5696
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5697
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5275
5698
 
5276
- GGML_ASSERT(src0_ddf_i != nullptr);
5277
- GGML_ASSERT(dst_ddf_i != nullptr);
5699
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5700
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5278
5701
 
5279
- const int64_t ne00 = src0->ne[0];
5280
- const int64_t i01_diff = i01_high - i01_low;
5281
-
5282
- // compute
5283
- silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
5702
+ silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
5284
5703
 
5285
5704
  (void) src1;
5286
5705
  (void) dst;
5287
- (void) src0_ddq_i;
5288
- (void) src1_ddf_i;
5289
- (void) i02;
5290
- (void) i1;
5706
+ (void) src1_dd;
5291
5707
  }
5292
5708
 
5293
5709
  inline void ggml_cuda_op_norm(
5294
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5295
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5296
- cudaStream_t & cudaStream_main){
5710
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5711
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5297
5712
 
5298
- GGML_ASSERT(src0_ddf_i != nullptr);
5299
- GGML_ASSERT(dst_ddf_i != nullptr);
5713
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5714
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5300
5715
 
5301
5716
  const int64_t ne00 = src0->ne[0];
5302
- const int64_t i01_diff = i01_high - i01_low;
5717
+ const int64_t nrows = ggml_nrows(src0);
5303
5718
 
5304
- // compute
5305
- norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
5719
+ norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
5306
5720
 
5307
5721
  (void) src1;
5308
5722
  (void) dst;
5309
- (void) src0_ddq_i;
5310
- (void) src1_ddf_i;
5311
- (void) i02;
5312
- (void) i1;
5723
+ (void) src1_dd;
5313
5724
  }
5314
5725
 
5315
5726
  inline void ggml_cuda_op_rms_norm(
5316
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5317
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5318
- cudaStream_t & cudaStream_main){
5727
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5728
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5319
5729
 
5320
- GGML_ASSERT(src0_ddf_i != nullptr);
5321
- GGML_ASSERT(dst_ddf_i != nullptr);
5730
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5731
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5322
5732
 
5323
5733
  const int64_t ne00 = src0->ne[0];
5324
- const int64_t i01_diff = i01_high - i01_low;
5734
+ const int64_t nrows = ggml_nrows(src0);
5325
5735
 
5326
5736
  float eps;
5327
5737
  memcpy(&eps, dst->op_params, sizeof(float));
5328
5738
 
5329
- // compute
5330
- rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
5739
+ rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
5331
5740
 
5332
5741
  (void) src1;
5333
5742
  (void) dst;
5334
- (void) src0_ddq_i;
5335
- (void) src1_ddf_i;
5336
- (void) i02;
5337
- (void) i1;
5743
+ (void) src1_dd;
5338
5744
  }
5339
5745
 
5340
5746
  inline void ggml_cuda_op_mul_mat_q(
5341
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5342
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5343
- cudaStream_t & cudaStream_main){
5344
-
5345
- GGML_ASSERT(src0_ddq_i != nullptr);
5346
- GGML_ASSERT(src1_ddf_i != nullptr);
5347
- GGML_ASSERT(dst_ddf_i != nullptr);
5747
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5748
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
5749
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5348
5750
 
5349
5751
  const int64_t ne00 = src0->ne[0];
5350
5752
 
5351
5753
  const int64_t ne10 = src1->ne[0];
5352
- const int64_t ne11 = src1->ne[1];
5353
5754
  GGML_ASSERT(ne10 % QK8_1 == 0);
5354
5755
 
5355
5756
  const int64_t ne0 = dst->ne[0];
5356
5757
 
5357
- const int64_t i01_diff = i01_high - i01_low;
5758
+ const int64_t row_diff = row_high - row_low;
5358
5759
 
5359
5760
  int id;
5360
5761
  CUDA_CHECK(cudaGetDevice(&id));
5361
5762
 
5362
5763
  // the main device has a larger memory buffer to hold the results from all GPUs
5363
5764
  // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
5364
- const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
5365
-
5366
- const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
5367
- ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
5368
- size_t as;
5369
- void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
5370
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
5765
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
5371
5766
 
5372
5767
  switch (src0->type) {
5373
5768
  case GGML_TYPE_Q4_0:
5374
- ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5769
+ ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5375
5770
  break;
5376
5771
  case GGML_TYPE_Q4_1:
5377
- ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5772
+ ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5378
5773
  break;
5379
5774
  case GGML_TYPE_Q5_0:
5380
- ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5775
+ ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5381
5776
  break;
5382
5777
  case GGML_TYPE_Q5_1:
5383
- ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5778
+ ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5384
5779
  break;
5385
5780
  case GGML_TYPE_Q8_0:
5386
- ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5781
+ ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5387
5782
  break;
5388
5783
  case GGML_TYPE_Q2_K:
5389
- ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5784
+ ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5390
5785
  break;
5391
5786
  case GGML_TYPE_Q3_K:
5392
- ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5787
+ ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5393
5788
  break;
5394
5789
  case GGML_TYPE_Q4_K:
5395
- ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5790
+ ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5396
5791
  break;
5397
5792
  case GGML_TYPE_Q5_K:
5398
- ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5793
+ ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5399
5794
  break;
5400
5795
  case GGML_TYPE_Q6_K:
5401
- ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5796
+ ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5402
5797
  break;
5403
5798
  default:
5404
5799
  GGML_ASSERT(false);
5405
5800
  break;
5406
5801
  }
5407
5802
 
5408
- ggml_cuda_pool_free(src1_q8_1, as);
5409
-
5410
5803
  (void) src1;
5411
5804
  (void) dst;
5412
- (void) src0_ddf_i;
5413
- (void) i02;
5414
- (void) i1;
5805
+ (void) src1_ddf_i;
5415
5806
  }
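Both the removed in-op quantization above and the new caller-side quantization in ggml_cuda_op_mul_mat (further down in this diff) size the q8_1 buffer from a row length rounded up to MATRIX_ROW_PADDING, so that every q8_1 block is complete. A host-side sketch of that arithmetic; the three constant values are assumptions for illustration, not taken from this hunk:

    #include <cstdint>
    #include <cstdio>

    static const int64_t MATRIX_ROW_PADDING = 512; // assumed value
    static const int64_t QK8_1              = 32;  // values per q8_1 block (assumed)
    static const int64_t Q8_1_TS            = 36;  // stand-in for sizeof(block_q8_1) (assumed)

    // same rounding as the removed padded_row_size computation:
    // keep n if it is already a multiple, otherwise round up to the next multiple
    static int64_t pad_to_multiple(int64_t n, int64_t mult) {
        return n % mult == 0 ? n : n - n % mult + mult;
    }

    int main() {
        const int64_t ne10 = 4097; // src1 row length (illustrative)
        const int64_t ne11 = 8;    // number of src1 rows to quantize

        const int64_t padded = pad_to_multiple(ne10, MATRIX_ROW_PADDING);

        // one q8_1 block per QK8_1 values, allocated for the padded row length
        const int64_t bytes = padded*ne11*Q8_1_TS/QK8_1;

        printf("padded row size: %lld, q8_1 buffer: %lld bytes\n",
               (long long) padded, (long long) bytes);
        return 0;
    }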
5416
5807
 
5417
5808
  static int64_t get_row_rounding(ggml_type type) {
5418
- int max_compute_capability = INT_MIN;
5419
- for (int id = 0; id < g_device_count; ++id) {
5420
- if (max_compute_capability < g_compute_capabilities[id]
5421
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
5422
- max_compute_capability = g_compute_capabilities[id];
5809
+ int64_t min_compute_capability = INT_MAX;
5810
+ int64_t max_compute_capability = INT_MIN;
5811
+ for (int64_t id = 0; id < g_device_count; ++id) {
5812
+ if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
5813
+ if (min_compute_capability > g_compute_capabilities[id]) {
5814
+ min_compute_capability = g_compute_capabilities[id];
5815
+ }
5816
+ if (max_compute_capability < g_compute_capabilities[id]) {
5817
+ max_compute_capability = g_compute_capabilities[id];
5818
+ }
5423
5819
  }
5424
5820
  }
5425
5821
 
5822
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
5823
+ switch(type) {
5824
+ case GGML_TYPE_Q4_0:
5825
+ case GGML_TYPE_Q4_1:
5826
+ case GGML_TYPE_Q5_0:
5827
+ case GGML_TYPE_Q5_1:
5828
+ case GGML_TYPE_Q8_0:
5829
+ return max_compute_capability >= CC_RDNA2 ? 128 : 64;
5830
+ case GGML_TYPE_F16:
5831
+ return 1;
5832
+ case GGML_TYPE_Q2_K:
5833
+ return max_compute_capability >= CC_RDNA2 ? 128 : 32;
5834
+ case GGML_TYPE_Q3_K:
5835
+ return min_compute_capability < CC_RDNA2 ? 128 : 64;
5836
+ case GGML_TYPE_Q4_K:
5837
+ case GGML_TYPE_Q5_K:
5838
+ case GGML_TYPE_Q6_K:
5839
+ return max_compute_capability >= CC_RDNA2 ? 128 : 64;
5840
+ default:
5841
+ GGML_ASSERT(false);
5842
+ }
5843
+ #else
5426
5844
  switch(type) {
5427
5845
  case GGML_TYPE_Q4_0:
5428
5846
  case GGML_TYPE_Q4_1:
@@ -5443,170 +5861,147 @@ static int64_t get_row_rounding(ggml_type type) {
5443
5861
  default:
5444
5862
  GGML_ASSERT(false);
5445
5863
  }
5864
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
5446
5865
  }
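get_row_rounding now tracks both the minimum and the maximum compute capability of the devices that actually receive a slice of the tensor, and the HIP branch picks its tile sizes per quantization type based on whether the cards are RDNA2 or newer. The value it returns is only a granularity; the sketch below shows how such a granularity combines with the fractional tensor split to give per-device row ranges, mirroring the row_low/row_high loop in ggml_cuda_op_mul_mat later in this diff (all concrete numbers are illustrative):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        const int64_t ne01     = 4096; // rows of src0
        const int64_t rounding = 64;   // e.g. what get_row_rounding() might return
        const std::vector<float> tensor_split = {0.0f, 0.4f, 0.75f}; // cumulative fractions, one per device
        const int device_count = (int) tensor_split.size();

        for (int id = 0; id < device_count; ++id) {
            int64_t row_low = id == 0 ? 0 : (int64_t) (ne01*tensor_split[id]);
            row_low -= row_low % rounding; // round down to the tile granularity

            int64_t row_high = ne01;
            if (id != device_count - 1) {
                row_high = (int64_t) (ne01*tensor_split[id + 1]);
                row_high -= row_high % rounding;
            }

            printf("device %d: rows [%lld, %lld)\n", id, (long long) row_low, (long long) row_high);
        }
        return 0;
    }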
5447
5866
 
5448
- inline void ggml_cuda_op_mul_mat_vec(
5449
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5450
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5451
- cudaStream_t & cudaStream_main){
5452
-
5453
- GGML_ASSERT(src0_ddq_i != nullptr);
5454
- GGML_ASSERT(src1_ddf_i != nullptr);
5455
- GGML_ASSERT(dst_ddf_i != nullptr);
5867
+ inline void ggml_cuda_op_mul_mat_vec_q(
5868
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5869
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
5870
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5456
5871
 
5457
5872
  const int64_t ne00 = src0->ne[0];
5458
- const int64_t nrows = i01_high - i01_low;
5873
+ const int64_t row_diff = row_high - row_low;
5459
5874
 
5460
- #ifdef GGML_CUDA_FORCE_DMMV
5461
- const bool use_mul_mat_vec_q = false;
5462
- (void) g_compute_capabilities[0];
5463
- #else
5464
- int id;
5465
- CUDA_CHECK(cudaGetDevice(&id));
5875
+ switch (src0->type) {
5876
+ case GGML_TYPE_Q4_0:
5877
+ mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5878
+ break;
5879
+ case GGML_TYPE_Q4_1:
5880
+ mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5881
+ break;
5882
+ case GGML_TYPE_Q5_0:
5883
+ mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5884
+ break;
5885
+ case GGML_TYPE_Q5_1:
5886
+ mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5887
+ break;
5888
+ case GGML_TYPE_Q8_0:
5889
+ mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5890
+ break;
5891
+ case GGML_TYPE_Q2_K:
5892
+ mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5893
+ break;
5894
+ case GGML_TYPE_Q3_K:
5895
+ mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5896
+ break;
5897
+ case GGML_TYPE_Q4_K:
5898
+ mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5899
+ break;
5900
+ case GGML_TYPE_Q5_K:
5901
+ mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5902
+ break;
5903
+ case GGML_TYPE_Q6_K:
5904
+ mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5905
+ break;
5906
+ default:
5907
+ GGML_ASSERT(false);
5908
+ break;
5909
+ }
5466
5910
 
5467
- bool mul_mat_vec_q_implemented =
5468
- src0->type == GGML_TYPE_Q4_0 ||
5469
- src0->type == GGML_TYPE_Q4_1 ||
5470
- src0->type == GGML_TYPE_Q5_0 ||
5471
- src0->type == GGML_TYPE_Q5_1 ||
5472
- src0->type == GGML_TYPE_Q8_0;
5473
- #if QK_K == 256
5474
- mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
5475
- src0->type == GGML_TYPE_Q2_K ||
5476
- src0->type == GGML_TYPE_Q3_K ||
5477
- src0->type == GGML_TYPE_Q4_K ||
5478
- src0->type == GGML_TYPE_Q5_K ||
5479
- src0->type == GGML_TYPE_Q6_K;
5480
- #endif // QK_K == 256
5481
-
5482
- const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
5483
- #endif
5911
+ (void) src1;
5912
+ (void) dst;
5913
+ (void) src1_ddf_i;
5914
+ (void) src1_ncols;
5915
+ (void) src1_padded_row_size;
5916
+ }
5484
5917
 
5485
- if (use_mul_mat_vec_q) {
5486
- const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
5487
- ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
5488
- size_t as;
5489
- void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
5490
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
5491
-
5492
- switch (src0->type) {
5493
- case GGML_TYPE_Q4_0:
5494
- mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5495
- break;
5496
- case GGML_TYPE_Q4_1:
5497
- mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5498
- break;
5499
- case GGML_TYPE_Q5_0:
5500
- mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5501
- break;
5502
- case GGML_TYPE_Q5_1:
5503
- mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5504
- break;
5505
- case GGML_TYPE_Q8_0:
5506
- mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5507
- break;
5508
- case GGML_TYPE_Q2_K:
5509
- mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5510
- break;
5511
- case GGML_TYPE_Q3_K:
5512
- mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5513
- break;
5514
- case GGML_TYPE_Q4_K:
5515
- mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5516
- break;
5517
- case GGML_TYPE_Q5_K:
5518
- mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5519
- break;
5520
- case GGML_TYPE_Q6_K:
5521
- mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5522
- break;
5523
- default:
5524
- GGML_ASSERT(false);
5525
- break;
5526
- }
5918
+ inline void ggml_cuda_op_dequantize_mul_mat_vec(
5919
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5920
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
5921
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5527
5922
 
5528
- ggml_cuda_pool_free(src1_q8_1, as);
5529
- } else {
5530
- // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
5923
+ const int64_t ne00 = src0->ne[0];
5924
+ const int64_t row_diff = row_high - row_low;
5925
+
5926
+ // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
5531
5927
  #ifdef GGML_CUDA_F16
5532
- size_t ash;
5533
- dfloat * src1_dfloat = nullptr; // dfloat == half
5534
-
5535
- bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
5536
- src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
5537
- src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
5538
-
5539
- if (src1_convert_f16) {
5540
- src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
5541
- ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
5542
- ne00, 1, sizeof(float), 0, 0,
5543
- ne00, 1, sizeof(half), 0, 0, cudaStream_main);
5544
- }
5928
+ size_t ash;
5929
+ dfloat * src1_dfloat = nullptr; // dfloat == half
5930
+
5931
+ bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
5932
+ src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
5933
+ src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
5934
+
5935
+ if (src1_convert_f16) {
5936
+ src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
5937
+ ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
5938
+ ne00, 1, sizeof(float), 0, 0,
5939
+ ne00, 1, sizeof(half), 0, 0, stream);
5940
+ }
5545
5941
  #else
5546
- dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
5942
+ const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
5547
5943
  #endif // GGML_CUDA_F16
5548
5944
 
5549
- switch (src0->type) {
5550
- case GGML_TYPE_Q4_0:
5551
- dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5552
- break;
5553
- case GGML_TYPE_Q4_1:
5554
- dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5555
- break;
5556
- case GGML_TYPE_Q5_0:
5557
- dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5558
- break;
5559
- case GGML_TYPE_Q5_1:
5560
- dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5561
- break;
5562
- case GGML_TYPE_Q8_0:
5563
- dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5564
- break;
5565
- case GGML_TYPE_Q2_K:
5566
- dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5567
- break;
5568
- case GGML_TYPE_Q3_K:
5569
- dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5570
- break;
5571
- case GGML_TYPE_Q4_K:
5572
- dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5573
- break;
5574
- case GGML_TYPE_Q5_K:
5575
- dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5576
- break;
5577
- case GGML_TYPE_Q6_K:
5578
- dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5579
- break;
5580
- case GGML_TYPE_F16:
5581
- convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5582
- break;
5583
- default:
5584
- GGML_ASSERT(false);
5585
- break;
5586
- }
5945
+ switch (src0->type) {
5946
+ case GGML_TYPE_Q4_0:
5947
+ dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5948
+ break;
5949
+ case GGML_TYPE_Q4_1:
5950
+ dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5951
+ break;
5952
+ case GGML_TYPE_Q5_0:
5953
+ dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5954
+ break;
5955
+ case GGML_TYPE_Q5_1:
5956
+ dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5957
+ break;
5958
+ case GGML_TYPE_Q8_0:
5959
+ dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5960
+ break;
5961
+ case GGML_TYPE_Q2_K:
5962
+ dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5963
+ break;
5964
+ case GGML_TYPE_Q3_K:
5965
+ dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5966
+ break;
5967
+ case GGML_TYPE_Q4_K:
5968
+ dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5969
+ break;
5970
+ case GGML_TYPE_Q5_K:
5971
+ dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5972
+ break;
5973
+ case GGML_TYPE_Q6_K:
5974
+ dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5975
+ break;
5976
+ case GGML_TYPE_F16:
5977
+ convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5978
+ break;
5979
+ default:
5980
+ GGML_ASSERT(false);
5981
+ break;
5982
+ }
5587
5983
 
5588
5984
  #ifdef GGML_CUDA_F16
5589
- if (src1_convert_f16) {
5590
- ggml_cuda_pool_free(src1_dfloat, ash);
5591
- }
5592
- #endif // GGML_CUDA_F16
5985
+ if (src1_convert_f16) {
5986
+ ggml_cuda_pool_free(src1_dfloat, ash);
5593
5987
  }
5988
+ #endif // GGML_CUDA_F16
5594
5989
 
5595
5990
  (void) src1;
5596
5991
  (void) dst;
5597
- (void) src0_ddf_i;
5598
- (void) i02;
5599
- (void) i1;
5992
+ (void) src1_ddq_i;
5993
+ (void) src1_ncols;
5994
+ (void) src1_padded_row_size;
5600
5995
  }
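The GGML_CUDA_F16 branch above decides whether src1 reaches the dequantize kernels as half or as float, which is what the "dfloat" in the comments stands for. A minimal stand-alone illustration of that type switch; the real typedef is not part of this hunk, and the half path needs the CUDA fp16 header:

    #ifdef GGML_CUDA_F16
    #include <cuda_fp16.h>
    typedef half dfloat;  // "dfloat == half": src1 is first copied into a pooled half buffer
    #else
    typedef float dfloat; // "dfloat == float, no conversion"
    #endif

    #include <cstdio>

    int main() {
        printf("sizeof(dfloat) = %zu\n", sizeof(dfloat));
        return 0;
    }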
5601
5996
 
5602
5997
  inline void ggml_cuda_op_mul_mat_cublas(
5603
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5604
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5605
- cudaStream_t & cudaStream_main){
5998
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5999
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
6000
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5606
6001
 
5607
- GGML_ASSERT(src0_ddf_i != nullptr);
6002
+ GGML_ASSERT(src0_dd_i != nullptr);
5608
6003
  GGML_ASSERT(src1_ddf_i != nullptr);
5609
- GGML_ASSERT(dst_ddf_i != nullptr);
6004
+ GGML_ASSERT(dst_dd_i != nullptr);
5610
6005
 
5611
6006
  const float alpha = 1.0f;
5612
6007
  const float beta = 0.0f;
@@ -5614,43 +6009,54 @@ inline void ggml_cuda_op_mul_mat_cublas(
5614
6009
  const int64_t ne00 = src0->ne[0];
5615
6010
 
5616
6011
  const int64_t ne10 = src1->ne[0];
5617
- const int64_t ne11 = src1->ne[1];
5618
6012
 
5619
6013
  const int64_t ne0 = dst->ne[0];
5620
- const int64_t i01_diff = i01_high - i01_low;
6014
+ const int64_t row_diff = row_high - row_low;
6015
+
6016
+ float * src0_ddq_as_f32;
6017
+ size_t src0_as = 0;
6018
+
6019
+ if (src0->type != GGML_TYPE_F32) {
6020
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
6021
+ src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
6022
+ to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
6023
+ }
6024
+ const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
5621
6025
 
5622
6026
  int id;
5623
6027
  CUDA_CHECK(cudaGetDevice(&id));
5624
6028
 
5625
6029
  // the main device has a larger memory buffer to hold the results from all GPUs
5626
6030
  // ldc == nrows of the matrix that cuBLAS writes into
5627
- int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
6031
+ int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
5628
6032
 
5629
- CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], cudaStream_main));
6033
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
5630
6034
  CUBLAS_CHECK(
5631
6035
  cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
5632
- i01_diff, ne11, ne10,
6036
+ row_diff, src1_ncols, ne10,
5633
6037
  &alpha, src0_ddf_i, ne00,
5634
- src1_ddf_i, ne10,
5635
- &beta, dst_ddf_i, ldc));
6038
+ src1_ddf_i, ne10,
6039
+ &beta, dst_dd_i, ldc));
6040
+
6041
+ if (src0_as > 0) {
6042
+ ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
6043
+ }
5636
6044
 
5637
6045
  (void) dst;
5638
- (void) src0_ddq_i;
5639
- (void) i02;
5640
- (void) i1;
6046
+ (void) src1_ddq_i;
6047
+ (void) src1_padded_row_size;
5641
6048
  }
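The cuBLAS path now also dequantizes non-F32 src0 slices into a pooled F32 buffer before the GEMM. The snippet below is a plain host-side reference of what the cublasSgemm call (CUBLAS_OP_T, CUBLAS_OP_N, m = row_diff, n = src1_ncols, k = ne10, ldc chosen as above) produces: every output element is the dot product of one src0 row with one src1 row, written with leading dimension ldc. This is an illustrative re-implementation with made-up sizes, not the code path used at runtime:

    #include <cstdio>
    #include <vector>

    int main() {
        const int ne00 = 4, row_diff = 3, src1_ncols = 2; // k, m, n (illustrative)
        const int ldc = row_diff; // ne0 on the main device, row_diff otherwise

        // row-major storage, as the device buffers are laid out
        std::vector<float> src0(row_diff*ne00), src1(src1_ncols*ne00), dst(ldc*src1_ncols, 0.0f);
        for (size_t i = 0; i < src0.size(); ++i) src0[i] = (float) i;
        for (size_t i = 0; i < src1.size(); ++i) src1[i] = 1.0f;

        for (int c = 0; c < src1_ncols; ++c) {     // one dst row per src1 row
            for (int r = 0; r < row_diff; ++r) {   // one dst element per src0 row
                float sum = 0.0f;
                for (int k = 0; k < ne00; ++k) {
                    sum += src0[r*ne00 + k] * src1[c*ne00 + k];
                }
                dst[c*ldc + r] = sum;              // column-major result with leading dimension ldc
            }
        }

        for (int c = 0; c < src1_ncols; ++c) {
            for (int r = 0; r < row_diff; ++r) printf("%6.1f", dst[c*ldc + r]);
            printf("\n");
        }
        return 0;
    }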
5642
6049
 
5643
6050
  inline void ggml_cuda_op_rope(
5644
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5645
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5646
- cudaStream_t & cudaStream_main){
6051
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6052
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5647
6053
 
5648
- GGML_ASSERT(src0_ddf_i != nullptr);
5649
- GGML_ASSERT(dst_ddf_i != nullptr);
6054
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6055
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5650
6056
 
5651
6057
  const int64_t ne00 = src0->ne[0];
5652
6058
  const int64_t ne01 = src0->ne[1];
5653
- const int64_t i01_diff = i01_high - i01_low;
6059
+ const int64_t nrows = ggml_nrows(src0);
5654
6060
 
5655
6061
  const int n_past = ((int32_t *) dst->op_params)[0];
5656
6062
  const int n_dims = ((int32_t *) dst->op_params)[1];
@@ -5663,44 +6069,37 @@ inline void ggml_cuda_op_rope(
5663
6069
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
5664
6070
 
5665
6071
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
6072
+ const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5666
6073
 
5667
6074
  const bool is_neox = mode & 2;
5668
6075
  const bool is_glm = mode & 4;
5669
6076
 
5670
6077
  // compute
5671
6078
  if (is_glm) {
5672
- const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
5673
- const float id_p = min(p, n_ctx - 2.f);
5674
- const float block_p = max(p - (n_ctx - 2.f), 0.f);
5675
- rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
6079
+ rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream);
5676
6080
  } else if (is_neox) {
5677
6081
  GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
5678
- const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5679
- rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
6082
+ rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
5680
6083
  } else {
5681
- const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5682
- rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
6084
+ rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
5683
6085
  }
5684
6086
 
5685
6087
  (void) src1;
5686
6088
  (void) dst;
5687
- (void) src0_ddq_i;
5688
- (void) src1_ddf_i;
5689
- (void) i1;
6089
+ (void) src1_dd;
5690
6090
  }
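p0 is now computed once, before the is_glm/is_neox/default branches, instead of separately inside each branch. A short, self-contained sketch of that scalar set-up; the kernels that consume p0 and theta_scale are outside this hunk, and in the usual RoPE formulation the rotation angle of dimension pair i at this position is p0 * theta_scale^i (values below are illustrative):

    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_past     = 128;
        const int   n_dims     = 128;
        const int   mode       = 0;        // bit 0 clear: positions start at n_past
        const float freq_base  = 10000.0f;
        const float freq_scale = 1.0f;

        const float theta_scale = powf(freq_base, -2.0f/n_dims);
        const float p0 = ((mode & 1) == 0 ? n_past : 0) * freq_scale;

        printf("theta_scale = %g, p0 = %g\n", theta_scale, p0);
        printf("angle of pair 0: %g, of pair 1: %g\n", p0, p0*theta_scale);
        return 0;
    }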
5691
6091
 
5692
6092
  inline void ggml_cuda_op_alibi(
5693
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5694
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5695
- cudaStream_t & cudaStream_main){
6093
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6094
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5696
6095
 
5697
- GGML_ASSERT(src0_ddf_i != nullptr);
5698
- GGML_ASSERT(dst_ddf_i != nullptr);
6096
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6097
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5699
6098
 
5700
6099
  const int64_t ne00 = src0->ne[0];
5701
6100
  const int64_t ne01 = src0->ne[1];
5702
6101
  const int64_t ne02 = src0->ne[2];
5703
- const int64_t i01_diff = i01_high - i01_low;
6102
+ const int64_t nrows = ggml_nrows(src0);
5704
6103
 
5705
6104
  const int n_past = ((int32_t *) dst->op_params)[0];
5706
6105
  const int n_head = ((int32_t *) dst->op_params)[1];
@@ -5715,334 +6114,354 @@ inline void ggml_cuda_op_alibi(
5715
6114
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
5716
6115
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
5717
6116
 
5718
- // compute
5719
- alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
6117
+ alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
5720
6118
 
5721
6119
  (void) src1;
5722
- (void) src0_ddq_i;
5723
- (void) src1_ddf_i;
5724
- (void) i1;
6120
+ (void) src1_dd;
5725
6121
  }
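ggml_cuda_op_alibi keeps the same slope-base computation as before and only switches the row count to ggml_nrows. A host-side sketch of that computation; n_heads_log2_floor is defined just above the lines shown here and is assumed to be the largest power of two not exceeding n_head:

    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_head   = 12;   // illustrative
        const float max_bias = 8.0f; // illustrative

        const int n_heads_log2_floor = 1 << (int) floorf(log2f((float) n_head));

        // m0 is the slope base for the first n_heads_log2_floor heads,
        // m1 for the remaining heads; the per-head slopes are formed in the kernel
        const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

        printf("n_heads_log2_floor = %d, m0 = %g, m1 = %g\n", n_heads_log2_floor, m0, m1);
        return 0;
    }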
5726
6122
 
5727
6123
  inline void ggml_cuda_op_diag_mask_inf(
5728
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5729
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5730
- cudaStream_t & cudaStream_main){
6124
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6125
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5731
6126
 
5732
- GGML_ASSERT(src0_ddf_i != nullptr);
5733
- GGML_ASSERT(dst_ddf_i != nullptr);
6127
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6128
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5734
6129
 
5735
6130
  const int64_t ne00 = src0->ne[0];
5736
6131
  const int64_t ne01 = src0->ne[1];
5737
- const int64_t i01_diff = i01_high - i01_low;
6132
+ const int nrows0 = ggml_nrows(src0);
5738
6133
 
5739
6134
  const int n_past = ((int32_t *) dst->op_params)[0];
5740
6135
 
5741
- // compute
5742
- diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
6136
+ diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
5743
6137
 
5744
6138
  (void) src1;
5745
6139
  (void) dst;
5746
- (void) src0_ddq_i;
5747
- (void) src1_ddf_i;
5748
- (void) i02;
5749
- (void) i1;
6140
+ (void) src1_dd;
5750
6141
  }
5751
6142
 
5752
6143
  inline void ggml_cuda_op_soft_max(
5753
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5754
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5755
- cudaStream_t & cudaStream_main){
6144
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6145
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5756
6146
 
5757
- GGML_ASSERT(src0_ddf_i != nullptr);
5758
- GGML_ASSERT(dst_ddf_i != nullptr);
6147
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6148
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5759
6149
 
5760
6150
  const int64_t ne00 = src0->ne[0];
5761
- const int64_t i01_diff = i01_high - i01_low;
6151
+ const int64_t nrows = ggml_nrows(src0);
5762
6152
 
5763
- // compute
5764
- soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
6153
+ soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
5765
6154
 
5766
6155
  (void) src1;
5767
6156
  (void) dst;
5768
- (void) src0_ddq_i;
5769
- (void) src1_ddf_i;
5770
- (void) i02;
5771
- (void) i1;
6157
+ (void) src1_dd;
5772
6158
  }
5773
6159
 
5774
6160
  inline void ggml_cuda_op_scale(
5775
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5776
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5777
- cudaStream_t & cudaStream_main){
6161
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6162
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5778
6163
 
5779
- GGML_ASSERT(src0_ddf_i != nullptr);
5780
- GGML_ASSERT(dst_ddf_i != nullptr);
6164
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6165
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
6166
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5781
6167
 
5782
6168
  const float scale = ((float *) src1->data)[0];
5783
6169
 
5784
- const int64_t ne00 = src0->ne[0];
5785
- const int64_t i01_diff = i01_high - i01_low;
5786
-
5787
- // compute
5788
- scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
6170
+ scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
5789
6171
  CUDA_CHECK(cudaGetLastError());
5790
6172
 
5791
6173
  (void) src1;
5792
6174
  (void) dst;
5793
- (void) src0_ddq_i;
5794
- (void) src1_ddf_i;
5795
- (void) i02;
5796
- (void) i1;
6175
+ (void) src1_dd;
6176
+ }
6177
+
6178
+ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
6179
+ const int64_t nrows0 = ggml_nrows(src0);
6180
+
6181
+ const bool use_src1 = src1 != nullptr;
6182
+ const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
6183
+
6184
+ GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6185
+ GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
6186
+
6187
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6188
+ struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
6189
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6190
+
6191
+ const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6192
+ const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
6193
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
6194
+
6195
+ const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
6196
+
6197
+ // dd = data device
6198
+ float * src0_ddf = nullptr;
6199
+ float * src1_ddf = nullptr;
6200
+ float * dst_ddf = nullptr;
6201
+
6202
+ // as = actual size
6203
+ size_t src0_asf = 0;
6204
+ size_t src1_asf = 0;
6205
+ size_t dst_asf = 0;
6206
+
6207
+ ggml_cuda_set_device(g_main_device);
6208
+ const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6209
+
6210
+ if (src0_on_device) {
6211
+ src0_ddf = (float *) src0_extra->data_device[g_main_device];
6212
+ } else {
6213
+ src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
6214
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
6215
+ }
6216
+
6217
+ if (use_src1 && !src1_stays_on_host) {
6218
+ if (src1_on_device) {
6219
+ src1_ddf = (float *) src1_extra->data_device[g_main_device];
6220
+ } else {
6221
+ src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
6222
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
6223
+ }
6224
+ }
6225
+ if (dst_on_device) {
6226
+ dst_ddf = (float *) dst_extra->data_device[g_main_device];
6227
+ } else {
6228
+ dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
6229
+ }
6230
+
6231
+ // do the computation
6232
+ op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
6233
+ CUDA_CHECK(cudaGetLastError());
6234
+
6235
+ // copy dst to host if necessary
6236
+ if (!dst_on_device) {
6237
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
6238
+ }
6239
+
6240
+ if (src0_asf > 0) {
6241
+ ggml_cuda_pool_free(src0_ddf, src0_asf);
6242
+ }
6243
+ if (src1_asf > 0) {
6244
+ ggml_cuda_pool_free(src1_ddf, src1_asf);
6245
+ }
6246
+ if (dst_asf > 0) {
6247
+ ggml_cuda_pool_free(dst_ddf, dst_asf);
6248
+ }
6249
+
6250
+ if (dst->backend == GGML_BACKEND_CPU) {
6251
+ CUDA_CHECK(cudaDeviceSynchronize());
6252
+ }
5797
6253
  }
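ggml_cuda_op_flatten centralizes what every element-wise op used to repeat: check whether src0/src1/dst already live on the device, pool-allocate and copy when they do not, call the op on plain device pointers and one stream, copy the result back, and free only the buffers it allocated. The toy, host-only model below mirrors that shape; every name prefixed with toy_ is a stand-in, and the real callback signature is the one visible in the op(...) call above (ggml_tensor pointers, float pointers and a cudaStream_t):

    #include <cmath>
    #include <cstdio>

    struct toy_tensor { int n; const float * host_data; float * host_dst; };
    typedef void (*toy_op_t)(const toy_tensor & src, const float * src_dd, float * dst_dd, int stream);

    // the "op": only sees device-side pointers, like ggml_cuda_op_silu above
    static void toy_op_silu(const toy_tensor & src, const float * src_dd, float * dst_dd, int /*stream*/) {
        for (int i = 0; i < src.n; ++i) {
            const float x = src_dd[i];
            dst_dd[i] = x / (1.0f + expf(-x)); // SiLU, what silu_f32_cuda computes on the device
        }
    }

    // the "flatten" helper: owns allocation, copy-in, dispatch and copy-out
    static void toy_op_flatten(const toy_tensor & src, toy_op_t op) {
        float * src_dd = new float[src.n]; // stands in for the pool malloc + copy to device
        float * dst_dd = new float[src.n];
        for (int i = 0; i < src.n; ++i) src_dd[i] = src.host_data[i];

        op(src, src_dd, dst_dd, /*stream=*/0);

        for (int i = 0; i < src.n; ++i) src.host_dst[i] = dst_dd[i]; // copy dst back to "host"
        delete [] src_dd;
        delete [] dst_dd;
    }

    int main() {
        const float in[4] = {-2.0f, -1.0f, 0.0f, 2.0f};
        float out[4];
        const toy_tensor t = {4, in, out};
        toy_op_flatten(t, toy_op_silu);
        for (int i = 0; i < 4; ++i) printf("%g ", out[i]);
        printf("\n");
        return 0;
    }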
5798
6254
 
5799
- static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5800
- ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
6255
+ static void ggml_cuda_op_mul_mat(
6256
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
6257
+ const bool convert_src1_to_q8_1) {
6258
+
5801
6259
  const int64_t ne00 = src0->ne[0];
5802
6260
  const int64_t ne01 = src0->ne[1];
5803
6261
  const int64_t ne02 = src0->ne[2];
5804
6262
  const int64_t ne03 = src0->ne[3];
5805
6263
  const int64_t nrows0 = ggml_nrows(src0);
5806
6264
 
5807
- const bool use_src1 = src1 != nullptr;
5808
- const int64_t ne10 = use_src1 ? src1->ne[0] : 1;
5809
- const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
5810
- const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
5811
- const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
5812
- const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
6265
+ const int64_t ne10 = src1->ne[0];
6266
+ const int64_t ne11 = src1->ne[1];
6267
+ const int64_t ne12 = src1->ne[2];
6268
+ const int64_t ne13 = src1->ne[3];
6269
+ const int64_t nrows1 = ggml_nrows(src1);
5813
6270
 
5814
6271
  GGML_ASSERT(ne03 == ne13);
5815
6272
 
5816
6273
  const int64_t ne0 = dst->ne[0];
5817
6274
  const int64_t ne1 = dst->ne[1];
5818
6275
 
5819
- const int nb2 = dst->nb[2];
5820
- const int nb3 = dst->nb[3];
6276
+ const int nb2 = dst->nb[2];
6277
+ const int nb3 = dst->nb[3];
5821
6278
 
5822
6279
  GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
5823
- GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6280
+ GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
5824
6281
 
5825
- // strides for iteration over dims 3 and 2
5826
- const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
5827
- const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
5828
- const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
5829
- const int64_t src0_stride = ne00 * ne01 * stride_mod;
5830
- const int64_t src1_stride = ne10 * ne11 * stride_mod;
5831
- const int64_t dst_stride = ne0 * ne1 * stride_mod;
6282
+ GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
5832
6283
 
5833
- const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
5834
- const int64_t i03_max = flatten_rows ? 1 : ne03;
5835
- const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
5836
- const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
5837
- GGML_ASSERT(!(flatten_rows && ne02 < ne12));
6284
+ const int64_t i02_divisor = ne12 / ne02;
5838
6285
 
5839
6286
  const size_t src0_ts = ggml_type_size(src0->type);
5840
6287
  const size_t src0_bs = ggml_blck_size(src0->type);
6288
+ const size_t q8_1_ts = sizeof(block_q8_1);
6289
+ const size_t q8_1_bs = QK8_1;
5841
6290
 
5842
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
5843
- struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
5844
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6291
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6292
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6293
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
5845
6294
 
5846
6295
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
5847
6296
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
5848
- const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
5849
6297
 
5850
- const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
5851
- const bool src1_stays_on_host = use_src1 && (
5852
- dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
6298
+ const bool src1_is_contiguous = ggml_is_contiguous(src1);
6299
+ const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ?
6300
+ ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
5853
6301
 
5854
6302
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
6303
+ GGML_ASSERT(!(split && ne02 > 1));
6304
+ GGML_ASSERT(!(split && ne03 > 1));
5855
6305
  GGML_ASSERT(!(split && ne02 < ne12));
5856
6306
 
5857
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
5858
-
5859
6307
  // dd = data device
5860
- char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
5861
- float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
5862
- float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
5863
- float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
5864
-
5865
- // asq = actual size quantized, asf = actual size float
5866
- size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
5867
- size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
6308
+ char * src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
6309
+ float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
6310
+ char * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
6311
+ float * dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
6312
+
6313
+ // as = actual size
6314
+ size_t src0_as[GGML_CUDA_MAX_DEVICES] = {0};
5868
6315
  size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
5869
- size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
6316
+ size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
6317
+ size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0};
5870
6318
 
5871
- // if multiple devices are used they need to wait for the main device
5872
- // here an event is recorded that signifies that the main device has finished calculating the input data
5873
- if (split && g_device_count > 1) {
5874
- CUDA_CHECK(cudaSetDevice(g_main_device));
5875
- CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
5876
- }
6319
+ int64_t row_low[GGML_CUDA_MAX_DEVICES];
6320
+ int64_t row_high[GGML_CUDA_MAX_DEVICES];
5877
6321
 
5878
- for (int id = 0; id < g_device_count; ++id) {
5879
- if (!split && id != g_main_device) {
5880
- continue;
5881
- }
5882
-
5883
- const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device;
5884
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
6322
+ for (int64_t id = 0; id < g_device_count; ++id) {
6323
+ // by default, use all rows
6324
+ row_low[id] = 0;
6325
+ row_high[id] = ne01;
5885
6326
 
5886
- int64_t row_low, row_high;
6327
+ // for multi GPU, get the row boundaries from tensor split
6328
+ // and round to mul_mat_q tile sizes
5887
6329
  if (split) {
5888
6330
  const int64_t rounding = get_row_rounding(src0->type);
5889
6331
 
5890
- row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
5891
- row_low -= row_low % rounding;
6332
+ if (id != 0) {
6333
+ row_low[id] = ne01*g_tensor_split[id];
6334
+ row_low[id] -= row_low[id] % rounding;
6335
+ }
5892
6336
 
5893
- if (id == g_device_count - 1) {
5894
- row_high = nrows0;
5895
- } else {
5896
- row_high = nrows0*g_tensor_split[id + 1];
5897
- row_high -= row_high % rounding;
6337
+ if (id != g_device_count - 1) {
6338
+ row_high[id] = ne01*g_tensor_split[id + 1];
6339
+ row_high[id] -= row_high[id] % rounding;
5898
6340
  }
5899
- } else {
5900
- row_low = 0;
5901
- row_high = nrows0*i02_divisor;
5902
6341
  }
5903
- if (row_low == row_high) {
6342
+ }
6343
+
6344
+ for (int64_t id = 0; id < g_device_count; ++id) {
6345
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
5904
6346
  continue;
5905
6347
  }
5906
6348
 
5907
- int64_t row_diff = row_high - row_low;
5908
-
5909
- cudaSetDevice(id);
5910
- cudaStream_t cudaStream_main = g_cudaStreams_main[id];
6349
+ const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
6350
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
5911
6351
 
5912
- // wait for main GPU data if necessary
5913
- if (split && id != g_main_device) {
5914
- CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
5915
- }
6352
+ ggml_cuda_set_device(id);
6353
+ const cudaStream_t stream = g_cudaStreams[id][0];
5916
6354
 
5917
6355
  if (src0_on_device && src0_is_contiguous) {
5918
- if (src0_is_f32) {
5919
- src0_ddf[id] = (float *) src0_extra->data_device[id];
5920
- } else {
5921
- src0_ddq[id] = (char *) src0_extra->data_device[id];
5922
- }
6356
+ src0_dd[id] = (char *) src0_extra->data_device[id];
5923
6357
  } else {
5924
- if (src0_is_f32) {
5925
- src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
5926
- } else {
5927
- src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
5928
- }
6358
+ const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
6359
+ src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
5929
6360
  }
5930
6361
 
5931
- if (src0_needs_f32 && !src0_is_f32) {
5932
- src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
6362
+ if (src1_on_device && src1_is_contiguous) {
6363
+ src1_ddf[id] = (float *) src1_extra->data_device[id];
6364
+ } else {
6365
+ src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
5933
6366
  }
5934
6367
 
5935
- if (use_src1 && !src1_stays_on_host) {
5936
- if (src1_on_device && src1_is_contiguous) {
5937
- src1_ddf[id] = (float *) src1_extra->data_device[id];
5938
- } else {
5939
- src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
6368
+ if (convert_src1_to_q8_1) {
6369
+ src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
6370
+
6371
+ if (split && src1_on_device && src1_is_contiguous) {
6372
+ quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
6373
+ CUDA_CHECK(cudaGetLastError());
5940
6374
  }
5941
6375
  }
6376
+
5942
6377
  if (dst_on_device) {
5943
- dst_ddf[id] = (float *) dst_extra->data_device[id];
6378
+ dst_dd[id] = (float *) dst_extra->data_device[id];
5944
6379
  } else {
5945
- size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float);
5946
- dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
6380
+ const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
6381
+ dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
5947
6382
  }
6383
+ }
5948
6384
 
5949
- for (int64_t i03 = 0; i03 < i03_max; i03++) {
5950
- const int64_t i13 = i03 % ne13;
5951
- for (int64_t i02 = 0; i02 < i02_max; i02++) {
5952
- const int64_t i12 = i02 % ne12;
6385
+ // if multiple devices are used they need to wait for the main device
6386
+ // here an event is recorded that signals that the main device has finished calculating the input data
6387
+ if (split && g_device_count > 1) {
6388
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6389
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
6390
+ }
5953
6391
 
5954
- const int64_t i0 = i03*i02_max + i02;
6392
+ const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
6393
+ for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
6394
+ const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
6395
+ const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
5955
6396
 
5956
- // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
5957
- const int64_t i0_offset_low = row_low/rows_per_iter;
5958
- const int64_t i0_offset_high = row_high/rows_per_iter;
6397
+ for (int64_t id = 0; id < g_device_count; ++id) {
6398
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
6399
+ continue;
6400
+ }
5959
6401
 
5960
- int64_t i01_low = 0;
5961
- int64_t i01_high = rows_per_iter;
5962
- if (split) {
5963
- if (i0 < i0_offset_low || i0 > i0_offset_high) {
5964
- continue;
5965
- }
5966
- if (i0 == i0_offset_low) {
5967
- i01_low = row_low % rows_per_iter;
5968
- }
5969
- if (i0 == i0_offset_high) {
5970
- i01_high = row_high % rows_per_iter;
5971
- }
5972
- }
6402
+ const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
6403
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
6404
+ const int64_t row_diff = row_high[id] - row_low[id];
5973
6405
 
5974
- // There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or optimizing out local variables.
5975
- // Removing the first assert or changing the order of the arguments causes the second assert to fail.
5976
- // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
5977
- // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
5978
- GGML_ASSERT(i01_low == 0 || g_device_count > 1);
5979
- GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
6406
+ ggml_cuda_set_device(id);
6407
+ const cudaStream_t stream = g_cudaStreams[id][is];
5980
6408
 
5981
- const int64_t i01_diff = i01_high - i01_low;
5982
- if (i01_diff == 0) {
5983
- continue;
5984
- }
5985
- const int64_t i11 = i13*ne12 + i12;
6409
+ // wait for main GPU data if necessary
6410
+ if (split && (id != g_main_device || is != 0)) {
6411
+ CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
6412
+ }
6413
+
6414
+ for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
6415
+ const int64_t i03 = i0 / ne12;
6416
+ const int64_t i02 = i0 % ne12;
6417
+
6418
+ const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
5986
6419
 
5987
6420
  // for split tensors the data begins at i0 == i0_offset_low
5988
- char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
5989
- float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
5990
- float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
5991
- float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
5992
-
5993
- // for split tensors the data pointer needs to be rounded down
5994
- // to the bin edge for i03, i02 bins beyond the first
5995
- if (i0 - i0_offset_low > 0) {
5996
- GGML_ASSERT(!flatten_rows);
5997
- src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
5998
- src0_ddf_i -= (row_low % ne01)*ne00;
5999
- dst_ddf_i -= (row_low % ne0)*ne1;
6000
- }
6421
+ char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
6422
+ float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
6423
+ char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
6424
+ float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
6001
6425
 
6002
6426
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
6003
6427
  // in that case an offset on dst_ddf_i is needed
6004
6428
  if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
6005
- dst_ddf_i += i01_low; // offset is 0 if no tensor split
6429
+ dst_dd_i += row_low[id]; // offset is 0 if no tensor split
6006
6430
  }
6007
6431
 
6008
6432
  // copy src0, src1 to device if necessary
6009
- if (use_src1 && !src1_stays_on_host) {
6010
- if (src1->backend == GGML_BACKEND_CPU) {
6011
- GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
6012
- int64_t nrows1 = flatten_rows ? nrows0 : ne11;
6013
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
6014
- } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
6015
- if (id != g_main_device) {
6016
- GGML_ASSERT(!flatten_rows);
6433
+ if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
6434
+ if (id != g_main_device) {
6435
+ if (convert_src1_to_q8_1) {
6436
+ char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
6437
+ CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs,
6438
+ cudaMemcpyDeviceToDevice, stream));
6439
+ } else {
6017
6440
  float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
6018
- src1_ddf_i_source += i11*src1_stride;
6019
- CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
6020
- cudaMemcpyDeviceToDevice, cudaStream_main));
6441
+ src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
6442
+ CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float),
6443
+ cudaMemcpyDeviceToDevice, stream));
6021
6444
  }
6022
- } else if (src1_on_device && !src1_is_contiguous) {
6023
- GGML_ASSERT(!split);
6024
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
6025
- } else {
6026
- GGML_ASSERT(false);
6027
6445
  }
6446
+ } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
6447
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
6448
+ src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
6449
+ } else {
6450
+ GGML_ASSERT(false);
6028
6451
  }
6029
6452
 
6030
- if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
6031
- if (src0_is_f32) {
6032
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
6033
- } else {
6034
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
6035
- }
6453
+ if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
6454
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
6455
+ CUDA_CHECK(cudaGetLastError());
6036
6456
  }
6037
6457
 
6038
- // convert src0 to f32 if it is necessary for the ggml_cuda_op
6039
- if (src0_needs_f32 && !src0_is_f32) {
6040
- to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
6041
- CUDA_CHECK(cudaGetLastError());
6458
+ if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
6459
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
6042
6460
  }
6043
6461
 
6044
6462
  // do the computation
6045
- op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
6463
+ op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
6464
+ row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
6046
6465
  CUDA_CHECK(cudaGetLastError());
6047
6466
 
6048
6467
  // copy dst to host or other device if necessary
@@ -6064,95 +6483,86 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
6064
6483
  // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
6065
6484
  // Instead they need to be copied to the correct slice in ne0 = dst row index.
6066
6485
  // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
6067
- float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
6068
- CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
6069
- i01_diff*sizeof(float), ne1, kind, cudaStream_main));
6486
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
6487
+ GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
6488
+ dhf_dst_i += src1_col_0*ne0 + row_low[id];
6489
+ CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float),
6490
+ row_diff*sizeof(float), src1_ncols, kind, stream));
6070
6491
  } else {
6071
6492
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
6072
- CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
6493
+ GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
6494
+ dhf_dst_i += src1_col_0*ne0;
6495
+ CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
6073
6496
  }
6074
6497
  }
6075
6498
 
6076
- // signify to main device that other device is done
6077
- if (split && g_device_count > 1 && id != g_main_device) {
6078
- CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
6499
+ // add event for the main device to wait on until other device is done
6500
+ if (split && (id != g_main_device || is != 0)) {
6501
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
6079
6502
  }
6080
6503
  }
6081
6504
  }
6082
6505
  }
6083
6506
 
6084
- // wait until each device is finished, then free their buffers
6085
- for (int id = 0; id < g_device_count; ++id) {
6086
- if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
6087
- continue;
6088
- }
6089
-
6090
- CUDA_CHECK(cudaSetDevice(id));
6507
+ for (int64_t id = 0; id < g_device_count; ++id) {
6508
+ CUDA_CHECK(ggml_cuda_set_device(id));
6091
6509
 
6092
- if (src0_asq[id] > 0) {
6093
- ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
6094
- }
6095
- if (src0_asf[id] > 0) {
6096
- ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
6510
+ // free buffers again when done
6511
+ if (src0_as[id] > 0) {
6512
+ ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
6097
6513
  }
6098
6514
  if (src1_asf[id] > 0) {
6099
6515
  ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
6100
6516
  }
6101
- if (dst_asf[id] > 0) {
6102
- ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
6517
+ if (src1_asq[id] > 0) {
6518
+ ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
6519
+ }
6520
+ if (dst_as[id] > 0) {
6521
+ ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
6103
6522
  }
6104
6523
  }
6105
6524
 
6106
6525
  // main device waits for all other devices to be finished
6107
6526
  if (split && g_device_count > 1) {
6108
- CUDA_CHECK(cudaSetDevice(g_main_device));
6109
- for (int id = 0; id < g_device_count; ++id) {
6110
- if (id != g_main_device && src0_extra->events[id]) {
6111
- CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
6527
+ int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
6528
+ is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
6529
+
6530
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6531
+ for (int64_t id = 0; id < g_device_count; ++id) {
6532
+ for (int64_t is = 0; is < is_max; ++is) {
6533
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
6112
6534
  }
6113
6535
  }
6114
6536
  }
6115
6537
 
6116
6538
  if (dst->backend == GGML_BACKEND_CPU) {
6117
- CUDA_CHECK(cudaSetDevice(g_main_device));
6539
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6118
6540
  CUDA_CHECK(cudaDeviceSynchronize());
6119
6541
  }
6120
6542
  }
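The main structural change in ggml_cuda_op_mul_mat is that the old per-(i03, i02) iteration over row slices is replaced by an outer walk over chunks of src1 columns, with each chunk pinned to one stream out of a small pool so that multi-GPU work can overlap. A host-side sketch of just that chunking and stream assignment; MUL_MAT_SRC1_COL_STRIDE and MAX_STREAMS are not defined in this hunk, so the values below are assumptions:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    static const int64_t MUL_MAT_SRC1_COL_STRIDE = 128; // assumed value
    static const int64_t MAX_STREAMS             = 8;   // assumed value

    int main() {
        const int64_t ne11 = 300;     // src1 columns (illustrative)
        const bool split = true;      // tensor split across devices
        const int  device_count = 2;

        const int64_t src1_col_stride = split && device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;

        for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
            const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
            const int64_t src1_ncols = std::min(src1_col_stride, ne11 - src1_col_0);
            printf("src1 columns [%lld, %lld) -> stream %lld\n",
                   (long long) src1_col_0, (long long) (src1_col_0 + src1_ncols), (long long) is);
        }
        return 0;
    }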
6121
6543
 
6122
6544
  void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6123
- // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
6124
- // Due to flatten_rows == true this does in practice not make a difference however.
6125
- // Better solution would be nice but right now that would require disproportionate changes.
6126
- GGML_ASSERT(
6127
- (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
6128
- src1->type == GGML_TYPE_F32 &&
6129
- (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
6130
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
6545
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
6131
6546
  }
6132
6547
 
6133
6548
  void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6134
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6135
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
6549
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
6136
6550
  }
6137
6551
 
6138
6552
  void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6139
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6140
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
6553
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
6141
6554
  }
6142
6555
 
6143
6556
  void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6144
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6145
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
6557
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
6146
6558
  }
6147
6559
 
6148
6560
  void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6149
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6150
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
6561
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
6151
6562
  }
6152
6563
 
6153
6564
  void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6154
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6155
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
6565
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
6156
6566
  }
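All of the element-wise wrappers above (add, mul, gelu, silu, norm, rms_norm, and likewise scale, soft_max, rope and alibi further down) now delegate to a single ggml_cuda_op_flatten helper with the per-op kernel passed as a callback, instead of feeding per-op flags into the old ggml_cuda_op. The helper's actual signature is not part of this hunk, so the sketch below only illustrates the delegation pattern with invented names (unary_launch_t, op_flatten, my_silu); the SiLU formula itself, x * sigmoid(x), is standard.

    // Pattern sketch with invented names; only the "one-line wrapper + shared dispatcher"
    // shape is taken from the diff.
    #include <cuda_runtime.h>
    #include <cstdint>

    typedef void (*unary_launch_t)(const float * x, float * y, int64_t n, cudaStream_t s);

    static __global__ void silu_kernel(const float * x, float * y, const int64_t n) {
        const int64_t i = blockIdx.x * (int64_t) blockDim.x + threadIdx.x;
        if (i < n) {
            y[i] = x[i] / (1.0f + expf(-x[i]));   // SiLU(x) = x * sigmoid(x)
        }
    }

    static void launch_silu(const float * x, float * y, int64_t n, cudaStream_t s) {
        const int block = 256;
        const int grid  = (int) ((n + block - 1) / block);   // assumes n fits the 1D grid limit
        silu_kernel<<<grid, block, 0, s>>>(x, y, n);
    }

    // shared path: treat the tensor as one flat buffer and run the op on a single stream
    static void op_flatten(const float * x, float * y, int64_t n,
                           cudaStream_t main_stream, unary_launch_t op) {
        op(x, y, n, main_stream);
    }

    // each public wrapper collapses to one delegating call, which is what
    // ggml_cuda_silu() and friends now do via ggml_cuda_op_flatten().
    static void my_silu(const float * x, float * y, int64_t n, cudaStream_t s) {
        op_flatten(x, y, n, s, launch_silu);
    }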
6157
6567
 
6158
6568
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
@@ -6186,8 +6596,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
6186
6596
 
6187
6597
  const int64_t ne12 = src1->ne[2];
6188
6598
 
6189
- CUDA_CHECK(cudaSetDevice(g_main_device));
6190
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
6599
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6600
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6191
6601
 
6192
6602
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6193
6603
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -6198,7 +6608,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
6198
6608
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6199
6609
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6200
6610
 
6201
- ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
6611
+ ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
6202
6612
  }
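Both single-tensor fast paths (this one and ggml_cuda_mul_mat_vec_nc below) read their device pointers out of tensor->extra, and the split path earlier indexes extra->events[id][is]. That implies a per-tensor GPU extra holding one buffer pointer per device plus a [device][stream] event grid; a layout sketch follows, with assumed array sizes and an invented struct name.

    // Layout inferred from the accesses in this diff; sizes and the struct name are assumed.
    #include <cuda_runtime.h>

    #define SKETCH_MAX_DEVICES 16
    #define SKETCH_MAX_STREAMS 8

    struct tensor_extra_gpu_sketch {
        void *      data_device[SKETCH_MAX_DEVICES];                 // one buffer (or slice) per GPU
        cudaEvent_t events[SKETCH_MAX_DEVICES][SKETCH_MAX_STREAMS];  // cross-device sync for split tensors
    };

    // how a single-device op fetches its buffer, cf. "src0_extra->data_device[g_main_device]" above
    static float * main_device_buffer(void * tensor_extra, int main_device) {
        tensor_extra_gpu_sketch * extra = (tensor_extra_gpu_sketch *) tensor_extra;
        return (float *) extra->data_device[main_device];
    }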
6203
6613
 
6204
6614
  void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -6217,8 +6627,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
6217
6627
  const int64_t nb01 = src0->nb[1];
6218
6628
  const int64_t nb02 = src0->nb[2];
6219
6629
 
6220
- CUDA_CHECK(cudaSetDevice(g_main_device));
6221
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
6630
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6631
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6222
6632
 
6223
6633
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6224
6634
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -6229,38 +6639,49 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
6229
6639
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6230
6640
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6231
6641
 
6232
- const int row_stride_x = nb01 / sizeof(half);
6233
- const int channel_stride_x = nb02 / sizeof(half);
6642
+ const int64_t row_stride_x = nb01 / sizeof(half);
6643
+ const int64_t channel_stride_x = nb02 / sizeof(half);
6234
6644
 
6235
- ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
6645
+ ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
6236
6646
  }
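One detail in this hunk: row_stride_x and channel_stride_x are now int64_t. nb01 and nb02 are byte strides, so dividing by sizeof(half) yields element strides, and keeping everything 64-bit prevents the channel offset (channel index times channel_stride_x) from overflowing a 32-bit int on large tensors. A worked example with assumed sizes:

    // Worked example with assumed tensor sizes (not taken from the diff); compile with nvcc.
    #include <cstdio>
    #include <cstdint>
    #include <cuda_fp16.h>

    int main() {
        // hypothetical f16 tensor: 8192 x 8192 per channel, 64 channels
        const int64_t ne00 = 8192, ne01 = 8192, ne02 = 64;
        const int64_t nb01 = ne00 * (int64_t) sizeof(half);    // bytes per row:     16384
        const int64_t nb02 = nb01 * ne01;                      // bytes per channel: 134217728 (128 MiB)

        const int64_t row_stride_x     = nb01 / sizeof(half);  // 8192 elements
        const int64_t channel_stride_x = nb02 / sizeof(half);  // 67108864 elements

        // channel 63 starts at 63 * 67108864 = 4227858432 elements, past INT_MAX,
        // which is why the strides (and anything multiplied by them) are 64-bit now.
        printf("row stride %lld, channel stride %lld, last channel offset %lld\n",
               (long long) row_stride_x, (long long) channel_stride_x,
               (long long) (channel_stride_x * (ne02 - 1)));
        return 0;
    }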
6237
6647
 
6238
6648
  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6239
6649
  bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
6240
6650
  src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
6241
6651
 
6652
+ int64_t min_compute_capability = INT_MAX;
6653
+ for (int64_t id = 0; id < g_device_count; ++id) {
6654
+ if (min_compute_capability > g_compute_capabilities[id]
6655
+ && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
6656
+ min_compute_capability = g_compute_capabilities[id];
6657
+ }
6658
+ }
6659
+
6242
6660
  if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
6243
6661
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
6244
6662
  } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
6245
6663
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
6246
6664
  }else if (src0->type == GGML_TYPE_F32) {
6247
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
6665
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
6248
6666
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
6249
6667
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
6250
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
6251
- } else {
6252
- int min_compute_capability = INT_MAX;
6253
- for (int id = 0; id < g_device_count; ++id) {
6254
- if (min_compute_capability > g_compute_capabilities[id]
6255
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
6256
- min_compute_capability = g_compute_capabilities[id];
6257
- }
6258
- }
6259
6668
 
6669
+ #ifdef GGML_CUDA_FORCE_DMMV
6670
+ const bool use_mul_mat_vec_q = false;
6671
+ #else
6672
+ const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
6673
+ #endif // GGML_CUDA_FORCE_DMMV
6674
+
6675
+ if (use_mul_mat_vec_q) {
6676
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
6677
+ } else {
6678
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
6679
+ }
6680
+ } else {
6260
6681
  if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
6261
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
6682
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
6262
6683
  } else {
6263
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
6684
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
6264
6685
  }
6265
6686
  }
6266
6687
  } else {
@@ -6269,8 +6690,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
6269
6690
  }
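The dispatch above can be read as a small decision tree: permuted or non-contiguous f16 with a single src1 column takes the dedicated vector kernels, f32 goes straight to cuBLAS, and quantized (or f16) src0 picks between the int8 dot-product paths and the fallbacks based on the minimum compute capability, taken only over devices whose tensor-split slice is non-empty, plus the GGML_CUDA_FORCE_DMMV and mul-mat-q switches. A compressed sketch of that decision, with invented struct fields standing in for the real ggml state:

    // Decision sketch only; the struct fields and enum are stand-ins for the real ggml state.
    #include <cstdint>

    enum mm_kernel { VEC_P021, VEC_NC, CUBLAS, MUL_MAT_VEC_Q, DEQUANT_MUL_MAT_VEC, MUL_MAT_Q };

    struct mm_inputs {
        bool    all_on_device;
        bool    src0_permuted, src1_permuted, src0_contiguous, src1_contiguous;
        bool    src0_is_f32, src0_is_quantized;
        int64_t src1_cols;                 // src1->ne[1]
        bool    ne00_multiple_of_dmmv_x;   // src0->ne[0] % GGML_CUDA_DMMV_X == 0
        int     min_compute_capability;    // min over devices that actually hold a slice of src0
        bool    force_dmmv;                // GGML_CUDA_FORCE_DMMV
        bool    mul_mat_q_enabled;         // g_mul_mat_q
    };

    static mm_kernel pick_mul_mat_kernel(const mm_inputs & in) {
        const int MIN_CC_DP4A_SKETCH = 610;  // matches the MIN_CC_DP4A define

        if (in.all_on_device &&  in.src0_permuted   && in.src1_permuted   && in.src1_cols == 1) return VEC_P021;
        if (in.all_on_device && !in.src0_contiguous && in.src1_contiguous && in.src1_cols == 1) return VEC_NC;
        if (in.src0_is_f32) return CUBLAS;

        // quantized or f16 src0 from here on
        if (in.src1_cols == 1 && in.ne00_multiple_of_dmmv_x) {
            // matrix-vector: the int8 dot-product path needs __dp4a on every participating device
            if (!in.force_dmmv && in.src0_is_quantized && in.min_compute_capability >= MIN_CC_DP4A_SKETCH) {
                return MUL_MAT_VEC_Q;
            }
            return DEQUANT_MUL_MAT_VEC;
        }
        if (in.mul_mat_q_enabled && in.src0_is_quantized && in.min_compute_capability >= MIN_CC_DP4A_SKETCH) {
            return MUL_MAT_Q;
        }
        return CUBLAS;
    }

The unsupported-type branch (the final else in the real function, which asserts) is omitted from the sketch.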
6270
6691
 
6271
6692
  void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6272
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6273
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
6693
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
6274
6694
  }
6275
6695
 
6276
6696
  void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6299,8 +6719,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
6299
6719
  const int64_t nb11 = src1->nb[1];
6300
6720
  const int64_t nb12 = src1->nb[2];
6301
6721
 
6302
- CUDA_CHECK(cudaSetDevice(g_main_device));
6303
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
6722
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6723
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6304
6724
 
6305
6725
  const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6306
6726
  const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -6310,10 +6730,10 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
6310
6730
 
6311
6731
  if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
6312
6732
  ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
6313
- ne10, ne11, nb10, nb11, nb12, cudaStream_main);
6733
+ ne10, ne11, nb10, nb11, nb12, main_stream);
6314
6734
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
6315
6735
  ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
6316
- ne10, ne11, nb10, nb11, nb12, cudaStream_main);
6736
+ ne10, ne11, nb10, nb11, nb12, main_stream);
6317
6737
  } else {
6318
6738
  GGML_ASSERT(false);
6319
6739
  }
@@ -6327,28 +6747,20 @@ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
6327
6747
  }
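ggml_cuda_cpy above dispatches on the (src0, src1) type pair: f32 to f32 is a strided copy, f32 to f16 converts on the fly, and anything else asserts. Below is a toy version of the converting copy for the contiguous case only; the real ggml_cpy_f32_f16_cuda also applies the row and plane strides (nb00 through nb12) that the hunk passes through, and the kernel/launcher names here are invented.

    // Toy f32 -> f16 copy for contiguous data; invented names, strides omitted.
    #include <cuda_runtime.h>
    #include <cuda_fp16.h>
    #include <cstdint>

    static __global__ void cpy_f32_f16_contig(const float * src, half * dst, const int64_t n) {
        const int64_t i = blockIdx.x * (int64_t) blockDim.x + threadIdx.x;
        if (i < n) {
            dst[i] = __float2half(src[i]);   // per-element narrowing conversion
        }
    }

    static void cpy_f32_f16_contig_cuda(const float * src, half * dst, int64_t n, cudaStream_t stream) {
        const int block = 256;
        const int grid  = (int) ((n + block - 1) / block);
        cpy_f32_f16_contig<<<grid, block, 0, stream>>>(src, dst, n);
    }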
6328
6748
 
6329
6749
  void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6330
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6331
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
6750
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
6332
6751
  }
6333
6752
 
6334
6753
  void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6335
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6336
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
6754
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
6337
6755
  }
6338
6756
 
6339
6757
  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6340
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6341
6758
  GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
6342
-
6343
- const int mode = ((int32_t *) dst->op_params)[2];
6344
- const bool is_glm = mode & 4;
6345
-
6346
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
6759
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
6347
6760
  }
6348
6761
 
6349
6762
  void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6350
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6351
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
6763
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
6352
6764
  }
6353
6765
 
6354
6766
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6358,7 +6770,7 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
6358
6770
  }
6359
6771
 
6360
6772
  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
6361
- int nrows = ggml_nrows(tensor);
6773
+ const int64_t nrows = ggml_nrows(tensor);
6362
6774
 
6363
6775
  const int64_t ne0 = tensor->ne[0];
6364
6776
 
@@ -6368,14 +6780,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
6368
6780
  struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
6369
6781
  memset(extra, 0, sizeof(*extra));
6370
6782
 
6371
- for (int id = 0; id < g_device_count; ++id) {
6783
+ for (int64_t id = 0; id < g_device_count; ++id) {
6372
6784
  if (backend == GGML_BACKEND_GPU && id != g_main_device) {
6373
6785
  continue;
6374
6786
  }
6375
6787
 
6376
- cudaSetDevice(id);
6788
+ ggml_cuda_set_device(id);
6377
6789
 
6378
- int row_low, row_high;
6790
+ int64_t row_low, row_high;
6379
6791
  if (backend == GGML_BACKEND_GPU) {
6380
6792
  row_low = 0;
6381
6793
  row_high = nrows;
@@ -6425,7 +6837,9 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
6425
6837
  extra->data_device[id] = buf;
6426
6838
 
6427
6839
  if (backend == GGML_BACKEND_GPU_SPLIT) {
6428
- CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
6840
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
6841
+ CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
6842
+ }
6429
6843
  }
6430
6844
  }
6431
6845
 
@@ -6439,15 +6853,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
6439
6853
 
6440
6854
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6441
6855
 
6442
- for (int id = 0; id < g_device_count; ++id) {
6856
+ for (int64_t id = 0; id < g_device_count; ++id) {
6443
6857
  if (extra->data_device[id] != nullptr) {
6444
- CUDA_CHECK(cudaSetDevice(id));
6858
+ CUDA_CHECK(ggml_cuda_set_device(id));
6445
6859
  CUDA_CHECK(cudaFree(extra->data_device[id]));
6446
6860
  }
6447
6861
 
6448
- if (extra->events[id] != nullptr) {
6449
- CUDA_CHECK(cudaSetDevice(id));
6450
- CUDA_CHECK(cudaEventDestroy(extra->events[id]));
6862
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
6863
+ if (extra->events[id][is] != nullptr) {
6864
+ CUDA_CHECK(ggml_cuda_set_device(id));
6865
+ CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
6866
+ }
6451
6867
  }
6452
6868
  }
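Taken together, the last two hunks pin down the event lifecycle for split tensors: ggml_cuda_transform_tensor creates MAX_STREAMS timing-disabled events per device, and ggml_cuda_free_data walks the same [device][stream] grid, destroying only the slots that are non-null and switching to the owning device first. A self-contained sketch of that create/destroy pairing, with assumed sizes and a flat global table:

    // Create/destroy pairing sketched with assumed sizes; not the ggml bookkeeping itself.
    #include <cuda_runtime.h>

    #define SKETCH_MAX_DEVICES 16
    #define SKETCH_MAX_STREAMS 8   // stand-in for the real MAX_STREAMS

    static cudaEvent_t g_events[SKETCH_MAX_DEVICES][SKETCH_MAX_STREAMS] = {};

    static cudaError_t create_split_events(int device_count) {
        for (int id = 0; id < device_count; ++id) {
            cudaError_t err = cudaSetDevice(id);
            if (err != cudaSuccess) return err;
            for (int is = 0; is < SKETCH_MAX_STREAMS; ++is) {
                // ordering only, no timestamps: cudaEventDisableTiming keeps record/wait cheap
                err = cudaEventCreateWithFlags(&g_events[id][is], cudaEventDisableTiming);
                if (err != cudaSuccess) return err;
            }
        }
        return cudaSuccess;
    }

    static void destroy_split_events(int device_count) {
        for (int id = 0; id < device_count; ++id) {
            for (int is = 0; is < SKETCH_MAX_STREAMS; ++is) {
                if (g_events[id][is] != nullptr) {    // only slots that were actually created
                    cudaSetDevice(id);                // destroy on the owning device
                    cudaEventDestroy(g_events[id][is]);
                    g_events[id][is] = nullptr;
                }
            }
        }
    }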
6453
6869
 
@@ -6499,7 +6915,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
6499
6915
  force_inplace;
6500
6916
  const size_t size = ggml_nbytes(tensor);
6501
6917
 
6502
- CUDA_CHECK(cudaSetDevice(g_main_device));
6918
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6503
6919
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
6504
6920
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
6505
6921
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];