llama_cpp 0.5.0 → 0.5.2

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry. The hunks below appear to come from the vendored llama.cpp CUDA/HIP backend source (ggml-cuda.cu).
@@ -13,7 +13,7 @@
  #ifdef __HIP_PLATFORM_AMD__
  // for rocblas_initialize()
  #include "rocblas/rocblas.h"
- #endif
+ #endif // __HIP_PLATFORM_AMD__
  #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
  #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
  #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
@@ -68,25 +68,52 @@
  #include <cuda_runtime.h>
  #include <cublas_v2.h>
  #include <cuda_fp16.h>
- #endif
+ #endif // defined(GGML_USE_HIPBLAS)

  #include "ggml-cuda.h"
  #include "ggml.h"

- #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
- #ifndef CC_TURING
- #define CC_TURING 700
- #endif
+ #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+ #define CC_TURING 700
+ #define CC_OFFSET_AMD 1000000
+ #define CC_RDNA2 CC_OFFSET_AMD + 1030

  #if defined(GGML_USE_HIPBLAS)
  #define __CUDA_ARCH__ 1300

+ #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+ defined(__gfx1150__) || defined(__gfx1151__)
+ #define RDNA3
+ #endif
+
+ #if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+ defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+ #define RDNA2
+ #endif
+
+ #ifndef __has_builtin
+ #define __has_builtin(x) 0
+ #endif
+
  typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
  static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
  const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
  const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+ #if __has_builtin(__builtin_elementwise_sub_sat)
  const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
  return reinterpret_cast<const int&>(c);
+ #else
+ int8x4_t c;
+ int16_t tmp;
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ tmp = va[i] - vb[i];
+ if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+ if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+ c[i] = tmp;
+ }
+ return reinterpret_cast<int&>(c);
+ #endif // __has_builtin(__builtin_elementwise_sub_sat)
  }

  static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
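Note on the __vsubss4 change above: the HIP shim now uses __builtin_elementwise_sub_sat only when the compiler advertises it via __has_builtin, and otherwise falls back to a scalar loop that clamps each byte difference to the int8_t range. The host-side snippet below is an editor's sketch, not part of the package, of that same per-byte saturating subtraction; the helper name and the test values are illustrative assumptions.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Scalar model of a 4x int8 saturating subtract, mirroring the fallback loop above.
static int vsubss4_scalar(int a, int b) {
    int8_t out[4];
    for (int i = 0; i < 4; ++i) {
        const int16_t va  = int8_t((unsigned(a) >> 8*i) & 0xff);
        const int16_t vb  = int8_t((unsigned(b) >> 8*i) & 0xff);
        out[i] = int8_t(std::clamp<int16_t>(va - vb, INT8_MIN, INT8_MAX));
    }
    int r;
    std::memcpy(&r, out, sizeof(r));
    return r;
}

int main() {
    // 127 - (-128) would overflow an int8_t; the saturating version clamps each byte to 127.
    std::printf("%08x\n", unsigned(vsubss4_scalar(0x7f7f7f7f, 0x80808080))); // prints 7f7f7f7f
}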
@@ -115,7 +142,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
  #endif
  return c;
  }
- #endif
+ #endif // defined(GGML_USE_HIPBLAS)

  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -127,8 +154,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  do { \
  cudaError_t err_ = (err); \
  if (err_ != cudaSuccess) { \
- fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
+ int id; \
+ cudaGetDevice(&id); \
+ fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
  cudaGetErrorString(err_)); \
+ fprintf(stderr, "current device: %d\n", id); \
  exit(1); \
  } \
  } while (0)
@@ -138,8 +168,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  do { \
  cublasStatus_t err_ = (err); \
  if (err_ != CUBLAS_STATUS_SUCCESS) { \
+ int id; \
+ cudaGetDevice(&id); \
  fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
  err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
+ fprintf(stderr, "current device: %d\n", id); \
  exit(1); \
  } \
  } while (0)
@@ -148,7 +181,10 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  do { \
  cublasStatus_t err_ = (err); \
  if (err_ != CUBLAS_STATUS_SUCCESS) { \
+ int id; \
+ cudaGetDevice(&id); \
  fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
+ fprintf(stderr, "current device: %d\n", id); \
  exit(1); \
  } \
  } while (0)
@@ -195,10 +231,13 @@ typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__
  typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
- typedef void (*ggml_cuda_op_t)(
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
- float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
- cudaStream_t & cudaStream_main);
+ typedef void (*ggml_cuda_op_mul_mat_t)(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+ const int64_t src1_padded_row_size, const cudaStream_t & stream);
+ typedef void (*ggml_cuda_op_flatten_t)(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream);

  // QK = number of values after dequantization
  // QR = QK / number of values before dequantization
@@ -379,11 +418,29 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
  #endif

+ #define MUL_MAT_SRC1_COL_STRIDE 128
+
+ #define MAX_STREAMS 8
+ static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+
  struct ggml_tensor_extra_gpu {
  void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
- cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+ cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
  };

+ // this is faster on Windows
+ // probably because the Windows CUDA libraries forget to make this check before invoking the drivers
+ inline cudaError_t ggml_cuda_set_device(const int device) {
+ int current_device;
+ CUDA_CHECK(cudaGetDevice(&current_device));
+
+ if (device == current_device) {
+ return cudaSuccess;
+ }
+
+ return cudaSetDevice(device);
+ }
+
  static int g_device_count = -1;
  static int g_main_device = 0;
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
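The hunk above replaces the single main stream per device with a pool of MAX_STREAMS streams (g_cudaStreams) and a matching per-stream event array in ggml_tensor_extra_gpu; ggml_cuda_set_device is a small wrapper that skips cudaSetDevice when the requested device is already current. How the stream pool is indexed lives in parts of the file not shown in this excerpt; the helper below is a purely illustrative editor's sketch of one plausible pattern, not the package's code.

// Editor's sketch only: cycle through the per-device stream pool when submitting
// the i-th chunk of work. The real selection logic is outside this diff excerpt.
static cudaStream_t pick_stream(const int device, const int64_t i_chunk) {
    return g_cudaStreams[device][i_chunk % MAX_STREAMS];
}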
@@ -396,8 +453,6 @@ static size_t g_scratch_offset = 0;

  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

- static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
-
  static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
  const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -447,58 +502,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
  dst[i] = x[i] / (1.0f + expf(-x[i]));
  }

+ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+ a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+ }
+ return a;
+ }
+
+ template <int block_size>
  static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
  const int tid = threadIdx.x;

  const float eps = 1e-5f;

- float mean = 0.0f;
- float var = 0.0f;
+ float2 mean_var = make_float2(0.f, 0.f);

- for (int col = tid; col < ncols; col += WARP_SIZE) {
+ for (int col = tid; col < ncols; col += block_size) {
  const float xi = x[row*ncols + col];
- mean += xi;
- var += xi * xi;
+ mean_var.x += xi;
+ mean_var.y += xi * xi;
  }

  // sum up partial sums
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
- var += __shfl_xor_sync(0xffffffff, var, mask, 32);
+ mean_var = warp_reduce_sum(mean_var);
+ if (block_size > WARP_SIZE) {
+ __shared__ float2 s_sum[32];
+ int warp_id = threadIdx.x / WARP_SIZE;
+ int lane_id = threadIdx.x % WARP_SIZE;
+ if (lane_id == 0) {
+ s_sum[warp_id] = mean_var;
+ }
+ __syncthreads();
+ mean_var = s_sum[lane_id];
+ mean_var = warp_reduce_sum(mean_var);
  }

- mean /= ncols;
- var = var / ncols - mean * mean;
- const float inv_var = rsqrtf(var + eps);
+ const float mean = mean_var.x / ncols;
+ const float var = mean_var.y / ncols - mean * mean;
+ const float inv_std = rsqrtf(var + eps);

- for (int col = tid; col < ncols; col += WARP_SIZE) {
- dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+ for (int col = tid; col < ncols; col += block_size) {
+ dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+ }
+ }
+
+ static __device__ __forceinline__ float warp_reduce_sum(float x) {
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ x += __shfl_xor_sync(0xffffffff, x, mask, 32);
  }
+ return x;
  }

+ template <int block_size>
  static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
  const int row = blockIdx.x*blockDim.y + threadIdx.y;
  const int tid = threadIdx.x;

  float tmp = 0.0f; // partial sum for thread in warp

- for (int col = tid; col < ncols; col += WARP_SIZE) {
+ for (int col = tid; col < ncols; col += block_size) {
  const float xi = x[row*ncols + col];
  tmp += xi * xi;
  }

  // sum up partial sums
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ tmp = warp_reduce_sum(tmp);
+ if (block_size > WARP_SIZE) {
+ __shared__ float s_sum[32];
+ int warp_id = threadIdx.x / WARP_SIZE;
+ int lane_id = threadIdx.x % WARP_SIZE;
+ if (lane_id == 0) {
+ s_sum[warp_id] = tmp;
+ }
+ __syncthreads();
+ tmp = s_sum[lane_id];
+ tmp = warp_reduce_sum(tmp);
  }

  const float mean = tmp / ncols;
  const float scale = rsqrtf(mean + eps);

- for (int col = tid; col < ncols; col += WARP_SIZE) {
+ for (int col = tid; col < ncols; col += block_size) {
  dst[row*ncols + col] = scale * x[row*ncols + col];
  }
  }
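The norm_f32/rms_norm_f32 kernels above are now templated on the block size: warp_reduce_sum (a butterfly over __shfl_xor_sync) reduces within a 32-lane warp, and when block_size > WARP_SIZE the per-warp partials are staged in shared memory and reduced once more. The standalone kernel below is an editor's sketch of that same two-stage block reduction, not code from the package; it assumes blockDim.x is a multiple of 32 and at most 1024, and it adds a bounds guard on the shared-memory read that the kernels above can omit because their wide path always launches 1024 threads.

__device__ __forceinline__ float warp_sum(float x) {
    #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
}

// Block-wide sum of n floats using the same shuffle + shared-memory pattern.
__global__ void block_sum(const float * x, float * out, const int n) {
    float acc = 0.0f;
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        acc += x[i];
    }
    acc = warp_sum(acc);                 // stage 1: reduce within each warp
    __shared__ float s_partial[32];      // one slot per possible warp
    const int warp_id = threadIdx.x / 32;
    const int lane_id = threadIdx.x % 32;
    if (lane_id == 0) {
        s_partial[warp_id] = acc;
    }
    __syncthreads();
    acc = lane_id < blockDim.x/32 ? s_partial[lane_id] : 0.0f;
    acc = warp_sum(acc);                 // stage 2: reduce the per-warp partials
    if (threadIdx.x == 0) {
        *out = acc;
    }
}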
@@ -3394,6 +3482,12 @@ static __device__ __forceinline__ void mul_mat_q(
3394
3482
  }
3395
3483
  }
3396
3484
 
3485
+ #define MMQ_X_Q4_0_RDNA2 64
3486
+ #define MMQ_Y_Q4_0_RDNA2 128
3487
+ #define NWARPS_Q4_0_RDNA2 8
3488
+ #define MMQ_X_Q4_0_RDNA1 64
3489
+ #define MMQ_Y_Q4_0_RDNA1 64
3490
+ #define NWARPS_Q4_0_RDNA1 8
3397
3491
  #define MMQ_X_Q4_0_AMPERE 64
3398
3492
  #define MMQ_Y_Q4_0_AMPERE 128
3399
3493
  #define NWARPS_Q4_0_AMPERE 4
@@ -3401,11 +3495,32 @@ static __device__ __forceinline__ void mul_mat_q(
3401
3495
  #define MMQ_Y_Q4_0_PASCAL 64
3402
3496
  #define NWARPS_Q4_0_PASCAL 8
3403
3497
 
3404
- template <bool need_check> static __global__ void mul_mat_q4_0(
3498
+ template <bool need_check> static __global__ void
3499
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3500
+ #if defined(RDNA3) || defined(RDNA2)
3501
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
3502
+ #endif // defined(RDNA3) || defined(RDNA2)
3503
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3504
+ mul_mat_q4_0(
3405
3505
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3406
3506
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3407
3507
 
3408
- #if __CUDA_ARCH__ >= CC_TURING
3508
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3509
+ #if defined(RDNA3) || defined(RDNA2)
3510
+ const int mmq_x = MMQ_X_Q4_0_RDNA2;
3511
+ const int mmq_y = MMQ_Y_Q4_0_RDNA2;
3512
+ const int nwarps = NWARPS_Q4_0_RDNA2;
3513
+ #else
3514
+ const int mmq_x = MMQ_X_Q4_0_RDNA1;
3515
+ const int mmq_y = MMQ_Y_Q4_0_RDNA1;
3516
+ const int nwarps = NWARPS_Q4_0_RDNA1;
3517
+ #endif // defined(RDNA3) || defined(RDNA2)
3518
+
3519
+ mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
3520
+ load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
3521
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3522
+
3523
+ #elif __CUDA_ARCH__ >= CC_TURING
3409
3524
  const int mmq_x = MMQ_X_Q4_0_AMPERE;
3410
3525
  const int mmq_y = MMQ_Y_Q4_0_AMPERE;
3411
3526
  const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -3428,6 +3543,12 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
3428
3543
  #endif // __CUDA_ARCH__ >= CC_TURING
3429
3544
  }
3430
3545
 
3546
+ #define MMQ_X_Q4_1_RDNA2 64
3547
+ #define MMQ_Y_Q4_1_RDNA2 128
3548
+ #define NWARPS_Q4_1_RDNA2 8
3549
+ #define MMQ_X_Q4_1_RDNA1 64
3550
+ #define MMQ_Y_Q4_1_RDNA1 64
3551
+ #define NWARPS_Q4_1_RDNA1 8
3431
3552
  #define MMQ_X_Q4_1_AMPERE 64
3432
3553
  #define MMQ_Y_Q4_1_AMPERE 128
3433
3554
  #define NWARPS_Q4_1_AMPERE 4
@@ -3436,14 +3557,33 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
3436
3557
  #define NWARPS_Q4_1_PASCAL 8
3437
3558
 
3438
3559
  template <bool need_check> static __global__ void
3439
- #if __CUDA_ARCH__ < CC_TURING
3560
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3561
+ #if defined(RDNA3) || defined(RDNA2)
3562
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
3563
+ #endif // defined(RDNA3) || defined(RDNA2)
3564
+ #elif __CUDA_ARCH__ < CC_TURING
3440
3565
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
3441
3566
  #endif // __CUDA_ARCH__ < CC_TURING
3442
3567
  mul_mat_q4_1(
3443
3568
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3444
3569
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3445
3570
 
3446
- #if __CUDA_ARCH__ >= CC_TURING
3571
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3572
+ #if defined(RDNA3) || defined(RDNA2)
3573
+ const int mmq_x = MMQ_X_Q4_1_RDNA2;
3574
+ const int mmq_y = MMQ_Y_Q4_1_RDNA2;
3575
+ const int nwarps = NWARPS_Q4_1_RDNA2;
3576
+ #else
3577
+ const int mmq_x = MMQ_X_Q4_1_RDNA1;
3578
+ const int mmq_y = MMQ_Y_Q4_1_RDNA1;
3579
+ const int nwarps = NWARPS_Q4_1_RDNA1;
3580
+ #endif // defined(RDNA3) || defined(RDNA2)
3581
+
3582
+ mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
3583
+ load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
3584
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3585
+
3586
+ #elif __CUDA_ARCH__ >= CC_TURING
3447
3587
  const int mmq_x = MMQ_X_Q4_1_AMPERE;
3448
3588
  const int mmq_y = MMQ_Y_Q4_1_AMPERE;
3449
3589
  const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -3466,6 +3606,12 @@ template <bool need_check> static __global__ void
3466
3606
  #endif // __CUDA_ARCH__ >= CC_TURING
3467
3607
  }
3468
3608
 
3609
+ #define MMQ_X_Q5_0_RDNA2 64
3610
+ #define MMQ_Y_Q5_0_RDNA2 128
3611
+ #define NWARPS_Q5_0_RDNA2 8
3612
+ #define MMQ_X_Q5_0_RDNA1 64
3613
+ #define MMQ_Y_Q5_0_RDNA1 64
3614
+ #define NWARPS_Q5_0_RDNA1 8
3469
3615
  #define MMQ_X_Q5_0_AMPERE 128
3470
3616
  #define MMQ_Y_Q5_0_AMPERE 64
3471
3617
  #define NWARPS_Q5_0_AMPERE 4
@@ -3473,11 +3619,32 @@ template <bool need_check> static __global__ void
3473
3619
  #define MMQ_Y_Q5_0_PASCAL 64
3474
3620
  #define NWARPS_Q5_0_PASCAL 8
3475
3621
 
3476
- template <bool need_check> static __global__ void mul_mat_q5_0(
3622
+ template <bool need_check> static __global__ void
3623
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3624
+ #if defined(RDNA3) || defined(RDNA2)
3625
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
3626
+ #endif // defined(RDNA3) || defined(RDNA2)
3627
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3628
+ mul_mat_q5_0(
3477
3629
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3478
3630
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3479
3631
 
3480
- #if __CUDA_ARCH__ >= CC_TURING
3632
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3633
+ #if defined(RDNA3) || defined(RDNA2)
3634
+ const int mmq_x = MMQ_X_Q5_0_RDNA2;
3635
+ const int mmq_y = MMQ_Y_Q5_0_RDNA2;
3636
+ const int nwarps = NWARPS_Q5_0_RDNA2;
3637
+ #else
3638
+ const int mmq_x = MMQ_X_Q5_0_RDNA1;
3639
+ const int mmq_y = MMQ_Y_Q5_0_RDNA1;
3640
+ const int nwarps = NWARPS_Q5_0_RDNA1;
3641
+ #endif // defined(RDNA3) || defined(RDNA2)
3642
+
3643
+ mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
3644
+ load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
3645
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3646
+
3647
+ #elif __CUDA_ARCH__ >= CC_TURING
3481
3648
  const int mmq_x = MMQ_X_Q5_0_AMPERE;
3482
3649
  const int mmq_y = MMQ_Y_Q5_0_AMPERE;
3483
3650
  const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -3500,6 +3667,12 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
3500
3667
  #endif // __CUDA_ARCH__ >= CC_TURING
3501
3668
  }
3502
3669
 
3670
+ #define MMQ_X_Q5_1_RDNA2 64
3671
+ #define MMQ_Y_Q5_1_RDNA2 128
3672
+ #define NWARPS_Q5_1_RDNA2 8
3673
+ #define MMQ_X_Q5_1_RDNA1 64
3674
+ #define MMQ_Y_Q5_1_RDNA1 64
3675
+ #define NWARPS_Q5_1_RDNA1 8
3503
3676
  #define MMQ_X_Q5_1_AMPERE 128
3504
3677
  #define MMQ_Y_Q5_1_AMPERE 64
3505
3678
  #define NWARPS_Q5_1_AMPERE 4
@@ -3507,11 +3680,32 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
3507
3680
  #define MMQ_Y_Q5_1_PASCAL 64
3508
3681
  #define NWARPS_Q5_1_PASCAL 8
3509
3682
 
3510
- template <bool need_check> static __global__ void mul_mat_q5_1(
3683
+ template <bool need_check> static __global__ void
3684
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3685
+ #if defined(RDNA3) || defined(RDNA2)
3686
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
3687
+ #endif // defined(RDNA3) || defined(RDNA2)
3688
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3689
+ mul_mat_q5_1(
3511
3690
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3512
3691
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3513
3692
 
3514
- #if __CUDA_ARCH__ >= CC_TURING
3693
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3694
+ #if defined(RDNA3) || defined(RDNA2)
3695
+ const int mmq_x = MMQ_X_Q5_1_RDNA2;
3696
+ const int mmq_y = MMQ_Y_Q5_1_RDNA2;
3697
+ const int nwarps = NWARPS_Q5_1_RDNA2;
3698
+ #else
3699
+ const int mmq_x = MMQ_X_Q5_1_RDNA1;
3700
+ const int mmq_y = MMQ_Y_Q5_1_RDNA1;
3701
+ const int nwarps = NWARPS_Q5_1_RDNA1;
3702
+ #endif // defined(RDNA3) || defined(RDNA2)
3703
+
3704
+ mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
3705
+ load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
3706
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3707
+
3708
+ #elif __CUDA_ARCH__ >= CC_TURING
3515
3709
  const int mmq_x = MMQ_X_Q5_1_AMPERE;
3516
3710
  const int mmq_y = MMQ_Y_Q5_1_AMPERE;
3517
3711
  const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -3534,6 +3728,12 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
3534
3728
  #endif // __CUDA_ARCH__ >= CC_TURING
3535
3729
  }
3536
3730
 
3731
+ #define MMQ_X_Q8_0_RDNA2 64
3732
+ #define MMQ_Y_Q8_0_RDNA2 128
3733
+ #define NWARPS_Q8_0_RDNA2 8
3734
+ #define MMQ_X_Q8_0_RDNA1 64
3735
+ #define MMQ_Y_Q8_0_RDNA1 64
3736
+ #define NWARPS_Q8_0_RDNA1 8
3537
3737
  #define MMQ_X_Q8_0_AMPERE 128
3538
3738
  #define MMQ_Y_Q8_0_AMPERE 64
3539
3739
  #define NWARPS_Q8_0_AMPERE 4
@@ -3541,11 +3741,32 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
3541
3741
  #define MMQ_Y_Q8_0_PASCAL 64
3542
3742
  #define NWARPS_Q8_0_PASCAL 8
3543
3743
 
3544
- template <bool need_check> static __global__ void mul_mat_q8_0(
3744
+ template <bool need_check> static __global__ void
3745
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3746
+ #if defined(RDNA3) || defined(RDNA2)
3747
+ __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
3748
+ #endif // defined(RDNA3) || defined(RDNA2)
3749
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3750
+ mul_mat_q8_0(
3545
3751
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3546
3752
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3547
3753
 
3548
- #if __CUDA_ARCH__ >= CC_TURING
3754
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3755
+ #if defined(RDNA3) || defined(RDNA2)
3756
+ const int mmq_x = MMQ_X_Q8_0_RDNA2;
3757
+ const int mmq_y = MMQ_Y_Q8_0_RDNA2;
3758
+ const int nwarps = NWARPS_Q8_0_RDNA2;
3759
+ #else
3760
+ const int mmq_x = MMQ_X_Q8_0_RDNA1;
3761
+ const int mmq_y = MMQ_Y_Q8_0_RDNA1;
3762
+ const int nwarps = NWARPS_Q8_0_RDNA1;
3763
+ #endif // defined(RDNA3) || defined(RDNA2)
3764
+
3765
+ mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
3766
+ load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
3767
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3768
+
3769
+ #elif __CUDA_ARCH__ >= CC_TURING
3549
3770
  const int mmq_x = MMQ_X_Q8_0_AMPERE;
3550
3771
  const int mmq_y = MMQ_Y_Q8_0_AMPERE;
3551
3772
  const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -3568,6 +3789,12 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
3568
3789
  #endif // __CUDA_ARCH__ >= CC_TURING
3569
3790
  }
3570
3791
 
3792
+ #define MMQ_X_Q2_K_RDNA2 64
3793
+ #define MMQ_Y_Q2_K_RDNA2 128
3794
+ #define NWARPS_Q2_K_RDNA2 8
3795
+ #define MMQ_X_Q2_K_RDNA1 128
3796
+ #define MMQ_Y_Q2_K_RDNA1 32
3797
+ #define NWARPS_Q2_K_RDNA1 8
3571
3798
  #define MMQ_X_Q2_K_AMPERE 64
3572
3799
  #define MMQ_Y_Q2_K_AMPERE 128
3573
3800
  #define NWARPS_Q2_K_AMPERE 4
@@ -3575,11 +3802,32 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
3575
3802
  #define MMQ_Y_Q2_K_PASCAL 64
3576
3803
  #define NWARPS_Q2_K_PASCAL 8
3577
3804
 
3578
- template <bool need_check> static __global__ void mul_mat_q2_K(
3805
+ template <bool need_check> static __global__ void
3806
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3807
+ #if defined(RDNA3) || defined(RDNA2)
3808
+ __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
3809
+ #endif // defined(RDNA3) || defined(RDNA2)
3810
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3811
+ mul_mat_q2_K(
3579
3812
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3580
3813
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3581
3814
 
3582
- #if __CUDA_ARCH__ >= CC_TURING
3815
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3816
+ #if defined(RDNA3) || defined(RDNA2)
3817
+ const int mmq_x = MMQ_X_Q2_K_RDNA2;
3818
+ const int mmq_y = MMQ_Y_Q2_K_RDNA2;
3819
+ const int nwarps = NWARPS_Q2_K_RDNA2;
3820
+ #else
3821
+ const int mmq_x = MMQ_X_Q2_K_RDNA1;
3822
+ const int mmq_y = MMQ_Y_Q2_K_RDNA1;
3823
+ const int nwarps = NWARPS_Q2_K_RDNA1;
3824
+ #endif // defined(RDNA3) || defined(RDNA2)
3825
+
3826
+ mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
3827
+ load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
3828
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3829
+
3830
+ #elif __CUDA_ARCH__ >= CC_TURING
3583
3831
  const int mmq_x = MMQ_X_Q2_K_AMPERE;
3584
3832
  const int mmq_y = MMQ_Y_Q2_K_AMPERE;
3585
3833
  const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -3602,6 +3850,12 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
3602
3850
  #endif // __CUDA_ARCH__ >= CC_TURING
3603
3851
  }
3604
3852
 
3853
+ #define MMQ_X_Q3_K_RDNA2 128
3854
+ #define MMQ_Y_Q3_K_RDNA2 64
3855
+ #define NWARPS_Q3_K_RDNA2 8
3856
+ #define MMQ_X_Q3_K_RDNA1 32
3857
+ #define MMQ_Y_Q3_K_RDNA1 128
3858
+ #define NWARPS_Q3_K_RDNA1 8
3605
3859
  #define MMQ_X_Q3_K_AMPERE 128
3606
3860
  #define MMQ_Y_Q3_K_AMPERE 128
3607
3861
  #define NWARPS_Q3_K_AMPERE 4
@@ -3610,14 +3864,33 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
3610
3864
  #define NWARPS_Q3_K_PASCAL 8
3611
3865
 
3612
3866
  template <bool need_check> static __global__ void
3613
- #if __CUDA_ARCH__ < CC_TURING
3867
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3868
+ #if defined(RDNA3) || defined(RDNA2)
3869
+ __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
3870
+ #endif // defined(RDNA3) || defined(RDNA2)
3871
+ #elif __CUDA_ARCH__ < CC_TURING
3614
3872
  __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
3615
3873
  #endif // __CUDA_ARCH__ < CC_TURING
3616
3874
  mul_mat_q3_K(
3617
3875
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3618
3876
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3619
3877
 
3620
- #if __CUDA_ARCH__ >= CC_TURING
3878
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3879
+ #if defined(RDNA3) || defined(RDNA2)
3880
+ const int mmq_x = MMQ_X_Q3_K_RDNA2;
3881
+ const int mmq_y = MMQ_Y_Q3_K_RDNA2;
3882
+ const int nwarps = NWARPS_Q3_K_RDNA2;
3883
+ #else
3884
+ const int mmq_x = MMQ_X_Q3_K_RDNA1;
3885
+ const int mmq_y = MMQ_Y_Q3_K_RDNA1;
3886
+ const int nwarps = NWARPS_Q3_K_RDNA1;
3887
+ #endif // defined(RDNA3) || defined(RDNA2)
3888
+
3889
+ mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
3890
+ load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
3891
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3892
+
3893
+ #elif __CUDA_ARCH__ >= CC_TURING
3621
3894
  const int mmq_x = MMQ_X_Q3_K_AMPERE;
3622
3895
  const int mmq_y = MMQ_Y_Q3_K_AMPERE;
3623
3896
  const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -3640,6 +3913,12 @@ template <bool need_check> static __global__ void
3640
3913
  #endif // __CUDA_ARCH__ >= CC_TURING
3641
3914
  }
3642
3915
 
3916
+ #define MMQ_X_Q4_K_RDNA2 64
3917
+ #define MMQ_Y_Q4_K_RDNA2 128
3918
+ #define NWARPS_Q4_K_RDNA2 8
3919
+ #define MMQ_X_Q4_K_RDNA1 32
3920
+ #define MMQ_Y_Q4_K_RDNA1 64
3921
+ #define NWARPS_Q4_K_RDNA1 8
3643
3922
  #define MMQ_X_Q4_K_AMPERE 64
3644
3923
  #define MMQ_Y_Q4_K_AMPERE 128
3645
3924
  #define NWARPS_Q4_K_AMPERE 4
@@ -3648,14 +3927,33 @@ template <bool need_check> static __global__ void
3648
3927
  #define NWARPS_Q4_K_PASCAL 8
3649
3928
 
3650
3929
  template <bool need_check> static __global__ void
3651
- #if __CUDA_ARCH__ < CC_TURING
3930
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3931
+ #if defined(RDNA3) || defined(RDNA2)
3932
+ __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
3933
+ #endif // defined(RDNA3) || defined(RDNA2)
3934
+ #elif __CUDA_ARCH__ < CC_TURING
3652
3935
  __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
3653
3936
  #endif // __CUDA_ARCH__ < CC_TURING
3654
3937
  mul_mat_q4_K(
3655
3938
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3656
3939
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3657
3940
 
3658
- #if __CUDA_ARCH__ >= CC_TURING
3941
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3942
+ #if defined(RDNA3) || defined(RDNA2)
3943
+ const int mmq_x = MMQ_X_Q4_K_RDNA2;
3944
+ const int mmq_y = MMQ_Y_Q4_K_RDNA2;
3945
+ const int nwarps = NWARPS_Q4_K_RDNA2;
3946
+ #else
3947
+ const int mmq_x = MMQ_X_Q4_K_RDNA1;
3948
+ const int mmq_y = MMQ_Y_Q4_K_RDNA1;
3949
+ const int nwarps = NWARPS_Q4_K_RDNA1;
3950
+ #endif // defined(RDNA3) || defined(RDNA2)
3951
+
3952
+ mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
3953
+ load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
3954
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
3955
+
3956
+ #elif __CUDA_ARCH__ >= CC_TURING
3659
3957
  const int mmq_x = MMQ_X_Q4_K_AMPERE;
3660
3958
  const int mmq_y = MMQ_Y_Q4_K_AMPERE;
3661
3959
  const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -3678,6 +3976,12 @@ template <bool need_check> static __global__ void
3678
3976
  #endif // __CUDA_ARCH__ >= CC_TURING
3679
3977
  }
3680
3978
 
3979
+ #define MMQ_X_Q5_K_RDNA2 64
3980
+ #define MMQ_Y_Q5_K_RDNA2 128
3981
+ #define NWARPS_Q5_K_RDNA2 8
3982
+ #define MMQ_X_Q5_K_RDNA1 32
3983
+ #define MMQ_Y_Q5_K_RDNA1 64
3984
+ #define NWARPS_Q5_K_RDNA1 8
3681
3985
  #define MMQ_X_Q5_K_AMPERE 64
3682
3986
  #define MMQ_Y_Q5_K_AMPERE 128
3683
3987
  #define NWARPS_Q5_K_AMPERE 4
@@ -3685,11 +3989,32 @@ template <bool need_check> static __global__ void
3685
3989
  #define MMQ_Y_Q5_K_PASCAL 64
3686
3990
  #define NWARPS_Q5_K_PASCAL 8
3687
3991
 
3688
- template <bool need_check> static __global__ void mul_mat_q5_K(
3992
+ template <bool need_check> static __global__ void
3993
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3994
+ #if defined(RDNA3) || defined(RDNA2)
3995
+ __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
3996
+ #endif // defined(RDNA3) || defined(RDNA2)
3997
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
3998
+ mul_mat_q5_K(
3689
3999
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3690
4000
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3691
4001
 
3692
- #if __CUDA_ARCH__ >= CC_TURING
4002
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4003
+ #if defined(RDNA3) || defined(RDNA2)
4004
+ const int mmq_x = MMQ_X_Q5_K_RDNA2;
4005
+ const int mmq_y = MMQ_Y_Q5_K_RDNA2;
4006
+ const int nwarps = NWARPS_Q5_K_RDNA2;
4007
+ #else
4008
+ const int mmq_x = MMQ_X_Q5_K_RDNA1;
4009
+ const int mmq_y = MMQ_Y_Q5_K_RDNA1;
4010
+ const int nwarps = NWARPS_Q5_K_RDNA1;
4011
+ #endif // defined(RDNA3) || defined(RDNA2)
4012
+
4013
+ mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
4014
+ load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
4015
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4016
+
4017
+ #elif __CUDA_ARCH__ >= CC_TURING
3693
4018
  const int mmq_x = MMQ_X_Q5_K_AMPERE;
3694
4019
  const int mmq_y = MMQ_Y_Q5_K_AMPERE;
3695
4020
  const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -3712,6 +4037,12 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
3712
4037
  #endif // __CUDA_ARCH__ >= CC_TURING
3713
4038
  }
3714
4039
 
4040
+ #define MMQ_X_Q6_K_RDNA2 64
4041
+ #define MMQ_Y_Q6_K_RDNA2 128
4042
+ #define NWARPS_Q6_K_RDNA2 8
4043
+ #define MMQ_X_Q6_K_RDNA1 32
4044
+ #define MMQ_Y_Q6_K_RDNA1 64
4045
+ #define NWARPS_Q6_K_RDNA1 8
3715
4046
  #define MMQ_X_Q6_K_AMPERE 64
3716
4047
  #define MMQ_Y_Q6_K_AMPERE 64
3717
4048
  #define NWARPS_Q6_K_AMPERE 4
@@ -3720,14 +4051,33 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
3720
4051
  #define NWARPS_Q6_K_PASCAL 8
3721
4052
 
3722
4053
  template <bool need_check> static __global__ void
3723
- #if __CUDA_ARCH__ < CC_TURING
4054
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4055
+ #if defined(RDNA3) || defined(RDNA2)
4056
+ __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
4057
+ #endif // defined(RDNA3) || defined(RDNA2)
4058
+ #elif __CUDA_ARCH__ < CC_TURING
3724
4059
  __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
3725
4060
  #endif // __CUDA_ARCH__ < CC_TURING
3726
4061
  mul_mat_q6_K(
3727
4062
  const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
3728
4063
  const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
3729
4064
 
3730
- #if __CUDA_ARCH__ >= CC_TURING
4065
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
4066
+ #if defined(RDNA3) || defined(RDNA2)
4067
+ const int mmq_x = MMQ_X_Q6_K_RDNA2;
4068
+ const int mmq_y = MMQ_Y_Q6_K_RDNA2;
4069
+ const int nwarps = NWARPS_Q6_K_RDNA2;
4070
+ #else
4071
+ const int mmq_x = MMQ_X_Q6_K_RDNA1;
4072
+ const int mmq_y = MMQ_Y_Q6_K_RDNA1;
4073
+ const int nwarps = NWARPS_Q6_K_RDNA1;
4074
+ #endif // defined(RDNA3) || defined(RDNA2)
4075
+
4076
+ mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
4077
+ load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
4078
+ (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
4079
+
4080
+ #elif __CUDA_ARCH__ >= CC_TURING
3731
4081
  const int mmq_x = MMQ_X_Q6_K_AMPERE;
3732
4082
  const int mmq_y = MMQ_Y_Q6_K_AMPERE;
3733
4083
  const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -4036,7 +4386,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
  dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
  }

- static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
+ const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
  const int half_n_dims = ncols/4;

@@ -4048,8 +4399,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
  const int i = row*ncols + col;

  const float col_theta_scale = powf(theta_scale, col);
+ const float p = p0 + p_delta*(row/p_delta_rows);

- const float theta = p*col_theta_scale;
+ const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
  const float sin_theta = sinf(theta);
  const float cos_theta = cosf(theta);

@@ -4059,7 +4411,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
  dst[i + 0] = x0*cos_theta - x1*sin_theta;
  dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

- const float block_theta = block_p*col_theta_scale;
+ const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
  const float sin_block_theta = sinf(block_theta);
  const float cos_block_theta = cosf(block_theta);

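The rope_glm_f32 changes above move the position bookkeeping into the kernel: instead of receiving a precomputed p and block_p, it derives p = p0 + p_delta*(row/p_delta_rows) per row and splits it into a rotary part capped at p_delta*(n_ctx - 2) and a block part carrying the remainder. The snippet below is an editor's check of that split with made-up values (before the col_theta_scale factor is applied), not code from the package.

#include <algorithm>
#include <cstdio>

int main() {
    const float p_delta = 1.0f;   // illustrative values only
    const int   n_ctx   = 6;
    const float p       = 7.0f;
    const float cap     = p_delta * (n_ctx - 2);             // 4.0
    std::printf("theta base = %.1f, block_theta base = %.1f\n",
                std::min(p, cap), std::max(p - cap, 0.0f));  // 4.0 and 3.0
}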
@@ -4186,14 +4538,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_

  static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % WARP_SIZE == 0);
- const dim3 block_dims(WARP_SIZE, 1, 1);
- norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ if (ncols < 1024) {
+ const dim3 block_dims(WARP_SIZE, 1, 1);
+ norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ } else {
+ const dim3 block_dims(1024, 1, 1);
+ norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ }
  }

  static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
  GGML_ASSERT(ncols % WARP_SIZE == 0);
- const dim3 block_dims(WARP_SIZE, 1, 1);
- rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+ if (ncols < 1024) {
+ const dim3 block_dims(WARP_SIZE, 1, 1);
+ rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+ } else {
+ const dim3 block_dims(1024, 1, 1);
+ rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+ }
  }

  static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
@@ -4498,7 +4860,15 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
4498
4860
  const int compute_capability = g_compute_capabilities[id];
4499
4861
 
4500
4862
  int mmq_x, mmq_y, nwarps;
4501
- if (compute_capability >= CC_TURING) {
4863
+ if (compute_capability >= CC_RDNA2) {
4864
+ mmq_x = MMQ_X_Q4_0_RDNA2;
4865
+ mmq_y = MMQ_Y_Q4_0_RDNA2;
4866
+ nwarps = NWARPS_Q4_0_RDNA2;
4867
+ } else if (compute_capability >= CC_OFFSET_AMD) {
4868
+ mmq_x = MMQ_X_Q4_0_RDNA1;
4869
+ mmq_y = MMQ_Y_Q4_0_RDNA1;
4870
+ nwarps = NWARPS_Q4_0_RDNA1;
4871
+ } else if (compute_capability >= CC_TURING) {
4502
4872
  mmq_x = MMQ_X_Q4_0_AMPERE;
4503
4873
  mmq_y = MMQ_Y_Q4_0_AMPERE;
4504
4874
  nwarps = NWARPS_Q4_0_AMPERE;
@@ -4535,7 +4905,15 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
4535
4905
  const int compute_capability = g_compute_capabilities[id];
4536
4906
 
4537
4907
  int mmq_x, mmq_y, nwarps;
4538
- if (compute_capability >= CC_TURING) {
4908
+ if (compute_capability >= CC_RDNA2) {
4909
+ mmq_x = MMQ_X_Q4_1_RDNA2;
4910
+ mmq_y = MMQ_Y_Q4_1_RDNA2;
4911
+ nwarps = NWARPS_Q4_1_RDNA2;
4912
+ } else if (compute_capability >= CC_OFFSET_AMD) {
4913
+ mmq_x = MMQ_X_Q4_1_RDNA1;
4914
+ mmq_y = MMQ_Y_Q4_1_RDNA1;
4915
+ nwarps = NWARPS_Q4_1_RDNA1;
4916
+ } else if (compute_capability >= CC_TURING) {
4539
4917
  mmq_x = MMQ_X_Q4_1_AMPERE;
4540
4918
  mmq_y = MMQ_Y_Q4_1_AMPERE;
4541
4919
  nwarps = NWARPS_Q4_1_AMPERE;
@@ -4572,7 +4950,15 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
4572
4950
  const int compute_capability = g_compute_capabilities[id];
4573
4951
 
4574
4952
  int mmq_x, mmq_y, nwarps;
4575
- if (compute_capability >= CC_TURING) {
4953
+ if (compute_capability >= CC_RDNA2) {
4954
+ mmq_x = MMQ_X_Q5_0_RDNA2;
4955
+ mmq_y = MMQ_Y_Q5_0_RDNA2;
4956
+ nwarps = NWARPS_Q5_0_RDNA2;
4957
+ } else if (compute_capability >= CC_OFFSET_AMD) {
4958
+ mmq_x = MMQ_X_Q5_0_RDNA1;
4959
+ mmq_y = MMQ_Y_Q5_0_RDNA1;
4960
+ nwarps = NWARPS_Q5_0_RDNA1;
4961
+ } else if (compute_capability >= CC_TURING) {
4576
4962
  mmq_x = MMQ_X_Q5_0_AMPERE;
4577
4963
  mmq_y = MMQ_Y_Q5_0_AMPERE;
4578
4964
  nwarps = NWARPS_Q5_0_AMPERE;
@@ -4609,7 +4995,15 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
4609
4995
  const int compute_capability = g_compute_capabilities[id];
4610
4996
 
4611
4997
  int mmq_x, mmq_y, nwarps;
4612
- if (compute_capability >= CC_TURING) {
4998
+ if (compute_capability >= CC_RDNA2) {
4999
+ mmq_x = MMQ_X_Q5_1_RDNA2;
5000
+ mmq_y = MMQ_Y_Q5_1_RDNA2;
5001
+ nwarps = NWARPS_Q5_1_RDNA2;
5002
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5003
+ mmq_x = MMQ_X_Q5_1_RDNA1;
5004
+ mmq_y = MMQ_Y_Q5_1_RDNA1;
5005
+ nwarps = NWARPS_Q5_1_RDNA1;
5006
+ } else if (compute_capability >= CC_TURING) {
4613
5007
  mmq_x = MMQ_X_Q5_1_AMPERE;
4614
5008
  mmq_y = MMQ_Y_Q5_1_AMPERE;
4615
5009
  nwarps = NWARPS_Q5_1_AMPERE;
@@ -4646,7 +5040,15 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
4646
5040
  const int compute_capability = g_compute_capabilities[id];
4647
5041
 
4648
5042
  int mmq_x, mmq_y, nwarps;
4649
- if (compute_capability >= CC_TURING) {
5043
+ if (compute_capability >= CC_RDNA2) {
5044
+ mmq_x = MMQ_X_Q8_0_RDNA2;
5045
+ mmq_y = MMQ_Y_Q8_0_RDNA2;
5046
+ nwarps = NWARPS_Q8_0_RDNA2;
5047
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5048
+ mmq_x = MMQ_X_Q8_0_RDNA1;
5049
+ mmq_y = MMQ_Y_Q8_0_RDNA1;
5050
+ nwarps = NWARPS_Q8_0_RDNA1;
5051
+ } else if (compute_capability >= CC_TURING) {
4650
5052
  mmq_x = MMQ_X_Q8_0_AMPERE;
4651
5053
  mmq_y = MMQ_Y_Q8_0_AMPERE;
4652
5054
  nwarps = NWARPS_Q8_0_AMPERE;
@@ -4683,7 +5085,15 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
4683
5085
  const int compute_capability = g_compute_capabilities[id];
4684
5086
 
4685
5087
  int mmq_x, mmq_y, nwarps;
4686
- if (compute_capability >= CC_TURING) {
5088
+ if (compute_capability >= CC_RDNA2) {
5089
+ mmq_x = MMQ_X_Q2_K_RDNA2;
5090
+ mmq_y = MMQ_Y_Q2_K_RDNA2;
5091
+ nwarps = NWARPS_Q2_K_RDNA2;
5092
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5093
+ mmq_x = MMQ_X_Q2_K_RDNA1;
5094
+ mmq_y = MMQ_Y_Q2_K_RDNA1;
5095
+ nwarps = NWARPS_Q2_K_RDNA1;
5096
+ } else if (compute_capability >= CC_TURING) {
4687
5097
  mmq_x = MMQ_X_Q2_K_AMPERE;
4688
5098
  mmq_y = MMQ_Y_Q2_K_AMPERE;
4689
5099
  nwarps = NWARPS_Q2_K_AMPERE;
@@ -4722,7 +5132,15 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
4722
5132
  const int compute_capability = g_compute_capabilities[id];
4723
5133
 
4724
5134
  int mmq_x, mmq_y, nwarps;
4725
- if (compute_capability >= CC_TURING) {
5135
+ if (compute_capability >= CC_RDNA2) {
5136
+ mmq_x = MMQ_X_Q3_K_RDNA2;
5137
+ mmq_y = MMQ_Y_Q3_K_RDNA2;
5138
+ nwarps = NWARPS_Q3_K_RDNA2;
5139
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5140
+ mmq_x = MMQ_X_Q3_K_RDNA1;
5141
+ mmq_y = MMQ_Y_Q3_K_RDNA1;
5142
+ nwarps = NWARPS_Q3_K_RDNA1;
5143
+ } else if (compute_capability >= CC_TURING) {
4726
5144
  mmq_x = MMQ_X_Q3_K_AMPERE;
4727
5145
  mmq_y = MMQ_Y_Q3_K_AMPERE;
4728
5146
  nwarps = NWARPS_Q3_K_AMPERE;
@@ -4760,7 +5178,15 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
4760
5178
  const int compute_capability = g_compute_capabilities[id];
4761
5179
 
4762
5180
  int mmq_x, mmq_y, nwarps;
4763
- if (compute_capability >= CC_TURING) {
5181
+ if (compute_capability >= CC_RDNA2) {
5182
+ mmq_x = MMQ_X_Q4_K_RDNA2;
5183
+ mmq_y = MMQ_Y_Q4_K_RDNA2;
5184
+ nwarps = NWARPS_Q4_K_RDNA2;
5185
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5186
+ mmq_x = MMQ_X_Q4_K_RDNA1;
5187
+ mmq_y = MMQ_Y_Q4_K_RDNA1;
5188
+ nwarps = NWARPS_Q4_K_RDNA1;
5189
+ } else if (compute_capability >= CC_TURING) {
4764
5190
  mmq_x = MMQ_X_Q4_K_AMPERE;
4765
5191
  mmq_y = MMQ_Y_Q4_K_AMPERE;
4766
5192
  nwarps = NWARPS_Q4_K_AMPERE;
@@ -4797,7 +5223,15 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
4797
5223
  const int compute_capability = g_compute_capabilities[id];
4798
5224
 
4799
5225
  int mmq_x, mmq_y, nwarps;
4800
- if (compute_capability >= CC_TURING) {
5226
+ if (compute_capability >= CC_RDNA2) {
5227
+ mmq_x = MMQ_X_Q5_K_RDNA2;
5228
+ mmq_y = MMQ_Y_Q5_K_RDNA2;
5229
+ nwarps = NWARPS_Q5_K_RDNA2;
5230
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5231
+ mmq_x = MMQ_X_Q5_K_RDNA1;
5232
+ mmq_y = MMQ_Y_Q5_K_RDNA1;
5233
+ nwarps = NWARPS_Q5_K_RDNA1;
5234
+ } else if (compute_capability >= CC_TURING) {
4801
5235
  mmq_x = MMQ_X_Q5_K_AMPERE;
4802
5236
  mmq_y = MMQ_Y_Q5_K_AMPERE;
4803
5237
  nwarps = NWARPS_Q5_K_AMPERE;
@@ -4834,7 +5268,15 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
4834
5268
  const int compute_capability = g_compute_capabilities[id];
4835
5269
 
4836
5270
  int mmq_x, mmq_y, nwarps;
4837
- if (compute_capability >= CC_TURING) {
5271
+ if (compute_capability >= CC_RDNA2) {
5272
+ mmq_x = MMQ_X_Q6_K_RDNA2;
5273
+ mmq_y = MMQ_Y_Q6_K_RDNA2;
5274
+ nwarps = NWARPS_Q6_K_RDNA2;
5275
+ } else if (compute_capability >= CC_OFFSET_AMD) {
5276
+ mmq_x = MMQ_X_Q6_K_RDNA1;
5277
+ mmq_y = MMQ_Y_Q6_K_RDNA1;
5278
+ nwarps = NWARPS_Q6_K_RDNA1;
5279
+ } else if (compute_capability >= CC_TURING) {
4838
5280
  mmq_x = MMQ_X_Q6_K_AMPERE;
4839
5281
  mmq_y = MMQ_Y_Q6_K_AMPERE;
4840
5282
  nwarps = NWARPS_Q6_K_AMPERE;
@@ -4924,12 +5366,13 @@ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, co
4924
5366
  rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
4925
5367
  }
4926
5368
 
4927
- static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
4928
- GGML_ASSERT(nrows % 4 == 0);
4929
- const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
4930
- const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
5369
+ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
5370
+ const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
5371
+ GGML_ASSERT(ncols % 4 == 0);
5372
+ const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
5373
+ const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
4931
5374
  const dim3 block_nums(num_blocks_x, nrows, 1);
4932
- rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
5375
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
4933
5376
  }
4934
5377
 
4935
5378
  static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5067,25 +5510,30 @@ void ggml_init_cublas() {
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
  int64_t total_vram = 0;
  fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
- for (int id = 0; id < g_device_count; ++id) {
+ for (int64_t id = 0; id < g_device_count; ++id) {
  cudaDeviceProp prop;
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
- fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+ fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);

  g_tensor_split[id] = total_vram;
  total_vram += prop.totalGlobalMem;
-
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
+ #else
  g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
  }
- for (int id = 0; id < g_device_count; ++id) {
+ for (int64_t id = 0; id < g_device_count; ++id) {
  g_tensor_split[id] /= total_vram;
  }

- for (int id = 0; id < g_device_count; ++id) {
- CUDA_CHECK(cudaSetDevice(id));
+ for (int64_t id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(ggml_cuda_set_device(id));

- // create main stream
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
+ // create cuda streams
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
+ }

  // create cublas handle
  CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
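With the change above, ggml_init_cublas encodes AMD devices on the same integer scale as NVIDIA ones by adding CC_OFFSET_AMD to 100*major + 10*minor, which is what lets the mul-mat dispatch code elsewhere in the diff order RDNA2 (>= CC_RDNA2), other AMD parts (>= CC_OFFSET_AMD), and Turing-or-newer NVIDIA parts (>= CC_TURING) with a single chain of comparisons. The sketch below is an editor's illustration of that encoding; the example devices are assumptions, not taken from the diff.

#include <cstdio>

constexpr int CC_TURING     = 700;
constexpr int CC_OFFSET_AMD = 1000000;
constexpr int CC_RDNA2      = CC_OFFSET_AMD + 1030;

// Same formula as in ggml_init_cublas above.
constexpr int encode_cc(const int major, const int minor, const bool is_amd) {
    return 100*major + 10*minor + (is_amd ? CC_OFFSET_AMD : 0);
}

int main() {
    std::printf("%d\n", encode_cc(10, 3, true));      // 1001030, i.e. >= CC_RDNA2 (e.g. a gfx1030 card)
    std::printf("%d\n", encode_cc( 8, 6, false));     // 860, i.e. >= CC_TURING (e.g. an Ampere GPU)
    std::printf("RDNA2 threshold = %d\n", CC_RDNA2);  // 1001030
}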
@@ -5154,7 +5602,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5154
5602
  if (src->backend == GGML_BACKEND_CPU) {
5155
5603
  kind = cudaMemcpyHostToDevice;
5156
5604
  src_ptr = (char *) src->data;
5157
- } else if (src->backend == GGML_BACKEND_GPU) {
5605
+ } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
5606
+ GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
5158
5607
  kind = cudaMemcpyDeviceToDevice;
5159
5608
  struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
5160
5609
  int id;
@@ -5193,236 +5642,205 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
5193
5642
  }
5194
5643
 
5195
5644
  inline void ggml_cuda_op_add(
5196
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5197
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5198
- cudaStream_t & cudaStream_main){
5199
-
5200
- GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
5201
- GGML_ASSERT(src1_ddf_i != nullptr);
5202
- GGML_ASSERT(dst_ddf_i != nullptr);
5645
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5646
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5203
5647
 
5204
- const int64_t ne00 = src0->ne[0];
5205
- const int64_t i01_diff = i01_high - i01_low;
5648
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
5206
5649
 
5207
5650
  const int64_t ne10 = src1->ne[0];
5208
5651
  const int64_t ne11 = src1->ne[1];
5209
5652
 
5210
- // compute
5211
5653
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
5212
- add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
5654
+ add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
5213
5655
  } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
5214
- add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
5656
+ add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
5215
5657
  } else {
5216
5658
  GGML_ASSERT(false);
5217
5659
  }
5218
5660
 
5219
5661
  (void) src1;
5220
5662
  (void) dst;
5221
- (void) src0_ddq_i;
5222
- (void) i02;
5223
- (void) i1;
5224
5663
  }
5225
5664
 
5226
5665
  inline void ggml_cuda_op_mul(
5227
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5228
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5229
- cudaStream_t & cudaStream_main){
5230
-
5231
- GGML_ASSERT(src0_ddf_i != nullptr);
5232
- GGML_ASSERT(src1_ddf_i != nullptr);
5233
- GGML_ASSERT(dst_ddf_i != nullptr);
5666
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5667
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5234
5668
 
5235
- const int64_t ne00 = src0->ne[0];
5236
- const int64_t i01_diff = i01_high - i01_low;
5669
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5670
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
5671
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5237
5672
 
5238
5673
  const int64_t ne10 = src1->ne[0];
5239
5674
  const int64_t ne11 = src1->ne[1];
5240
5675
 
5241
- mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
5676
+ mul_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
5242
5677
 
5243
5678
  (void) dst;
5244
- (void) src0_ddq_i;
5245
- (void) i02;
5246
- (void) i1;
5247
5679
  }
5248
5680
 
5249
5681
  inline void ggml_cuda_op_gelu(
5250
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5251
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5252
- cudaStream_t & cudaStream_main){
5253
-
5254
- GGML_ASSERT(src0_ddf_i != nullptr);
5255
- GGML_ASSERT(dst_ddf_i != nullptr);
5682
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5683
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5256
5684
 
5257
- const int64_t ne00 = src0->ne[0];
5258
- const int64_t i01_diff = i01_high - i01_low;
5685
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5686
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5259
5687
 
5260
- // compute
5261
- gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
5688
+ gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
5262
5689
 
5263
5690
  (void) src1;
5264
5691
  (void) dst;
5265
- (void) src0_ddq_i;
5266
- (void) src1_ddf_i;
5267
- (void) i02;
5268
- (void) i1;
5692
+ (void) src1_dd;
5269
5693
  }
5270
5694
 
5271
5695
  inline void ggml_cuda_op_silu(
5272
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5273
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5274
- cudaStream_t & cudaStream_main){
5696
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5697
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5275
5698
 
5276
- GGML_ASSERT(src0_ddf_i != nullptr);
5277
- GGML_ASSERT(dst_ddf_i != nullptr);
5699
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5700
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5278
5701
 
5279
- const int64_t ne00 = src0->ne[0];
5280
- const int64_t i01_diff = i01_high - i01_low;
5281
-
5282
- // compute
5283
- silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
5702
+ silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
5284
5703
 
5285
5704
  (void) src1;
5286
5705
  (void) dst;
5287
- (void) src0_ddq_i;
5288
- (void) src1_ddf_i;
5289
- (void) i02;
5290
- (void) i1;
5706
+ (void) src1_dd;
5291
5707
  }
5292
5708
 
5293
5709
  inline void ggml_cuda_op_norm(
5294
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5295
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5296
- cudaStream_t & cudaStream_main){
5710
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5711
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5297
5712
 
5298
- GGML_ASSERT(src0_ddf_i != nullptr);
5299
- GGML_ASSERT(dst_ddf_i != nullptr);
5713
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5714
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5300
5715
 
5301
5716
  const int64_t ne00 = src0->ne[0];
5302
- const int64_t i01_diff = i01_high - i01_low;
5717
+ const int64_t nrows = ggml_nrows(src0);
5303
5718
 
5304
- // compute
5305
- norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
5719
+ norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
5306
5720
 
5307
5721
  (void) src1;
5308
5722
  (void) dst;
5309
- (void) src0_ddq_i;
5310
- (void) src1_ddf_i;
5311
- (void) i02;
5312
- (void) i1;
5723
+ (void) src1_dd;
5313
5724
  }
5314
5725
 
5315
5726
  inline void ggml_cuda_op_rms_norm(
5316
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5317
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5318
- cudaStream_t & cudaStream_main){
5727
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5728
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5319
5729
 
5320
- GGML_ASSERT(src0_ddf_i != nullptr);
5321
- GGML_ASSERT(dst_ddf_i != nullptr);
5730
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
5731
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5322
5732
 
5323
5733
  const int64_t ne00 = src0->ne[0];
5324
- const int64_t i01_diff = i01_high - i01_low;
5734
+ const int64_t nrows = ggml_nrows(src0);
5325
5735
 
5326
5736
  float eps;
5327
5737
  memcpy(&eps, dst->op_params, sizeof(float));
5328
5738
 
5329
- // compute
5330
- rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
5739
+ rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
5331
5740
 
5332
5741
  (void) src1;
5333
5742
  (void) dst;
5334
- (void) src0_ddq_i;
5335
- (void) src1_ddf_i;
5336
- (void) i02;
5337
- (void) i1;
5743
+ (void) src1_dd;
5338
5744
  }
5339
5745
 
5340
5746
  inline void ggml_cuda_op_mul_mat_q(
5341
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5342
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5343
- cudaStream_t & cudaStream_main){
5344
-
5345
- GGML_ASSERT(src0_ddq_i != nullptr);
5346
- GGML_ASSERT(src1_ddf_i != nullptr);
5347
- GGML_ASSERT(dst_ddf_i != nullptr);
5747
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5748
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
5749
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5348
5750
 
5349
5751
  const int64_t ne00 = src0->ne[0];
5350
5752
 
5351
5753
  const int64_t ne10 = src1->ne[0];
5352
- const int64_t ne11 = src1->ne[1];
5353
5754
  GGML_ASSERT(ne10 % QK8_1 == 0);
5354
5755
 
5355
5756
  const int64_t ne0 = dst->ne[0];
5356
5757
 
5357
- const int64_t i01_diff = i01_high - i01_low;
5758
+ const int64_t row_diff = row_high - row_low;
5358
5759
 
5359
5760
  int id;
5360
5761
  CUDA_CHECK(cudaGetDevice(&id));
5361
5762
 
5362
5763
  // the main device has a larger memory buffer to hold the results from all GPUs
5363
5764
  // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
5364
- const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
5365
-
5366
- const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
5367
- ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
5368
- size_t as;
5369
- void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*sizeof(block_q8_1)/QK8_1, &as);
5370
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, cudaStream_main);
5765
+ const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
5371
5766
 
5372
5767
  switch (src0->type) {
5373
5768
  case GGML_TYPE_Q4_0:
5374
- ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5769
+ ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5375
5770
  break;
5376
5771
  case GGML_TYPE_Q4_1:
5377
- ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5772
+ ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5378
5773
  break;
5379
5774
  case GGML_TYPE_Q5_0:
5380
- ggml_mul_mat_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5775
+ ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5381
5776
  break;
5382
5777
  case GGML_TYPE_Q5_1:
5383
- ggml_mul_mat_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5778
+ ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5384
5779
  break;
5385
5780
  case GGML_TYPE_Q8_0:
5386
- ggml_mul_mat_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5781
+ ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5387
5782
  break;
5388
5783
  case GGML_TYPE_Q2_K:
5389
- ggml_mul_mat_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5784
+ ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5390
5785
  break;
5391
5786
  case GGML_TYPE_Q3_K:
5392
- ggml_mul_mat_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5787
+ ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5393
5788
  break;
5394
5789
  case GGML_TYPE_Q4_K:
5395
- ggml_mul_mat_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5790
+ ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5396
5791
  break;
5397
5792
  case GGML_TYPE_Q5_K:
5398
- ggml_mul_mat_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5793
+ ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5399
5794
  break;
5400
5795
  case GGML_TYPE_Q6_K:
5401
- ggml_mul_mat_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, padded_row_size, nrows_dst, cudaStream_main);
5796
+ ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
5402
5797
  break;
5403
5798
  default:
5404
5799
  GGML_ASSERT(false);
5405
5800
  break;
5406
5801
  }
5407
5802
 
5408
- ggml_cuda_pool_free(src1_q8_1, as);
5409
-
5410
5803
  (void) src1;
5411
5804
  (void) dst;
5412
- (void) src0_ddf_i;
5413
- (void) i02;
5414
- (void) i1;
5805
+ (void) src1_ddf_i;
5415
5806
  }
5416
5807
 
5417
5808
  static int64_t get_row_rounding(ggml_type type) {
5418
- int max_compute_capability = INT_MIN;
5419
- for (int id = 0; id < g_device_count; ++id) {
5420
- if (max_compute_capability < g_compute_capabilities[id]
5421
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
5422
- max_compute_capability = g_compute_capabilities[id];
5809
+ int64_t min_compute_capability = INT_MAX;
5810
+ int64_t max_compute_capability = INT_MIN;
5811
+ for (int64_t id = 0; id < g_device_count; ++id) {
5812
+ if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
5813
+ if (min_compute_capability > g_compute_capabilities[id]) {
5814
+ min_compute_capability = g_compute_capabilities[id];
5815
+ }
5816
+ if (max_compute_capability < g_compute_capabilities[id]) {
5817
+ max_compute_capability = g_compute_capabilities[id];
5818
+ }
5423
5819
  }
5424
5820
  }
5425
5821
 
5822
+ #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
5823
+ switch(type) {
5824
+ case GGML_TYPE_Q4_0:
5825
+ case GGML_TYPE_Q4_1:
5826
+ case GGML_TYPE_Q5_0:
5827
+ case GGML_TYPE_Q5_1:
5828
+ case GGML_TYPE_Q8_0:
5829
+ return max_compute_capability >= CC_RDNA2 ? 128 : 64;
5830
+ case GGML_TYPE_F16:
5831
+ return 1;
5832
+ case GGML_TYPE_Q2_K:
5833
+ return max_compute_capability >= CC_RDNA2 ? 128 : 32;
5834
+ case GGML_TYPE_Q3_K:
5835
+ return min_compute_capability < CC_RDNA2 ? 128 : 64;
5836
+ case GGML_TYPE_Q4_K:
5837
+ case GGML_TYPE_Q5_K:
5838
+ case GGML_TYPE_Q6_K:
5839
+ return max_compute_capability >= CC_RDNA2 ? 128 : 64;
5840
+ default:
5841
+ GGML_ASSERT(false);
5842
+ }
5843
+ #else
5426
5844
  switch(type) {
5427
5845
  case GGML_TYPE_Q4_0:
5428
5846
  case GGML_TYPE_Q4_1:
@@ -5443,170 +5861,147 @@ static int64_t get_row_rounding(ggml_type type) {
5443
5861
  default:
5444
5862
  GGML_ASSERT(false);
5445
5863
  }
5864
+ #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
5446
5865
  }
5447
5866
 
5448
- inline void ggml_cuda_op_mul_mat_vec(
5449
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5450
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5451
- cudaStream_t & cudaStream_main){
5452
-
5453
- GGML_ASSERT(src0_ddq_i != nullptr);
5454
- GGML_ASSERT(src1_ddf_i != nullptr);
5455
- GGML_ASSERT(dst_ddf_i != nullptr);
5867
+ inline void ggml_cuda_op_mul_mat_vec_q(
5868
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5869
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
5870
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5456
5871
 
5457
5872
  const int64_t ne00 = src0->ne[0];
5458
- const int64_t nrows = i01_high - i01_low;
5873
+ const int64_t row_diff = row_high - row_low;
5459
5874
 
5460
- #ifdef GGML_CUDA_FORCE_DMMV
5461
- const bool use_mul_mat_vec_q = false;
5462
- (void) g_compute_capabilities[0];
5463
- #else
5464
- int id;
5465
- CUDA_CHECK(cudaGetDevice(&id));
5875
+ switch (src0->type) {
5876
+ case GGML_TYPE_Q4_0:
5877
+ mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5878
+ break;
5879
+ case GGML_TYPE_Q4_1:
5880
+ mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5881
+ break;
5882
+ case GGML_TYPE_Q5_0:
5883
+ mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5884
+ break;
5885
+ case GGML_TYPE_Q5_1:
5886
+ mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5887
+ break;
5888
+ case GGML_TYPE_Q8_0:
5889
+ mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5890
+ break;
5891
+ case GGML_TYPE_Q2_K:
5892
+ mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5893
+ break;
5894
+ case GGML_TYPE_Q3_K:
5895
+ mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5896
+ break;
5897
+ case GGML_TYPE_Q4_K:
5898
+ mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5899
+ break;
5900
+ case GGML_TYPE_Q5_K:
5901
+ mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5902
+ break;
5903
+ case GGML_TYPE_Q6_K:
5904
+ mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
5905
+ break;
5906
+ default:
5907
+ GGML_ASSERT(false);
5908
+ break;
5909
+ }
5466
5910
 
5467
- bool mul_mat_vec_q_implemented =
5468
- src0->type == GGML_TYPE_Q4_0 ||
5469
- src0->type == GGML_TYPE_Q4_1 ||
5470
- src0->type == GGML_TYPE_Q5_0 ||
5471
- src0->type == GGML_TYPE_Q5_1 ||
5472
- src0->type == GGML_TYPE_Q8_0;
5473
- #if QK_K == 256
5474
- mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
5475
- src0->type == GGML_TYPE_Q2_K ||
5476
- src0->type == GGML_TYPE_Q3_K ||
5477
- src0->type == GGML_TYPE_Q4_K ||
5478
- src0->type == GGML_TYPE_Q5_K ||
5479
- src0->type == GGML_TYPE_Q6_K;
5480
- #endif // QK_K == 256
5481
-
5482
- const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
5483
- #endif
5911
+ (void) src1;
5912
+ (void) dst;
5913
+ (void) src1_ddf_i;
5914
+ (void) src1_ncols;
5915
+ (void) src1_padded_row_size;
5916
+ }
5484
5917
 
5485
- if (use_mul_mat_vec_q) {
5486
- const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
5487
- ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
5488
- size_t as;
5489
- void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
5490
- quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, 1, padded_row_size, cudaStream_main);
5491
-
5492
- switch (src0->type) {
5493
- case GGML_TYPE_Q4_0:
5494
- mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5495
- break;
5496
- case GGML_TYPE_Q4_1:
5497
- mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5498
- break;
5499
- case GGML_TYPE_Q5_0:
5500
- mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5501
- break;
5502
- case GGML_TYPE_Q5_1:
5503
- mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5504
- break;
5505
- case GGML_TYPE_Q8_0:
5506
- mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5507
- break;
5508
- case GGML_TYPE_Q2_K:
5509
- mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5510
- break;
5511
- case GGML_TYPE_Q3_K:
5512
- mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5513
- break;
5514
- case GGML_TYPE_Q4_K:
5515
- mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5516
- break;
5517
- case GGML_TYPE_Q5_K:
5518
- mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5519
- break;
5520
- case GGML_TYPE_Q6_K:
5521
- mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
5522
- break;
5523
- default:
5524
- GGML_ASSERT(false);
5525
- break;
5526
- }
5918
+ inline void ggml_cuda_op_dequantize_mul_mat_vec(
5919
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5920
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
5921
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5527
5922
 
5528
- ggml_cuda_pool_free(src1_q8_1, as);
5529
- } else {
5530
- // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
5923
+ const int64_t ne00 = src0->ne[0];
5924
+ const int64_t row_diff = row_high - row_low;
5925
+
5926
+ // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
5531
5927
  #ifdef GGML_CUDA_F16
5532
- size_t ash;
5533
- dfloat * src1_dfloat = nullptr; // dfloat == half
5534
-
5535
- bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
5536
- src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
5537
- src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
5538
-
5539
- if (src1_convert_f16) {
5540
- src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
5541
- ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
5542
- ne00, 1, sizeof(float), 0, 0,
5543
- ne00, 1, sizeof(half), 0, 0, cudaStream_main);
5544
- }
5928
+ size_t ash;
5929
+ dfloat * src1_dfloat = nullptr; // dfloat == half
5930
+
5931
+ bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
5932
+ src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
5933
+ src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
5934
+
5935
+ if (src1_convert_f16) {
5936
+ src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
5937
+ ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
5938
+ ne00, 1, sizeof(float), 0, 0,
5939
+ ne00, 1, sizeof(half), 0, 0, stream);
5940
+ }
5545
5941
  #else
5546
- dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
5942
+ const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
5547
5943
  #endif // GGML_CUDA_F16
5548
5944
 
5549
- switch (src0->type) {
5550
- case GGML_TYPE_Q4_0:
5551
- dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5552
- break;
5553
- case GGML_TYPE_Q4_1:
5554
- dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5555
- break;
5556
- case GGML_TYPE_Q5_0:
5557
- dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5558
- break;
5559
- case GGML_TYPE_Q5_1:
5560
- dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5561
- break;
5562
- case GGML_TYPE_Q8_0:
5563
- dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5564
- break;
5565
- case GGML_TYPE_Q2_K:
5566
- dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5567
- break;
5568
- case GGML_TYPE_Q3_K:
5569
- dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5570
- break;
5571
- case GGML_TYPE_Q4_K:
5572
- dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5573
- break;
5574
- case GGML_TYPE_Q5_K:
5575
- dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5576
- break;
5577
- case GGML_TYPE_Q6_K:
5578
- dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
5579
- break;
5580
- case GGML_TYPE_F16:
5581
- convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
5582
- break;
5583
- default:
5584
- GGML_ASSERT(false);
5585
- break;
5586
- }
5945
+ switch (src0->type) {
5946
+ case GGML_TYPE_Q4_0:
5947
+ dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5948
+ break;
5949
+ case GGML_TYPE_Q4_1:
5950
+ dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5951
+ break;
5952
+ case GGML_TYPE_Q5_0:
5953
+ dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5954
+ break;
5955
+ case GGML_TYPE_Q5_1:
5956
+ dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5957
+ break;
5958
+ case GGML_TYPE_Q8_0:
5959
+ dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5960
+ break;
5961
+ case GGML_TYPE_Q2_K:
5962
+ dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5963
+ break;
5964
+ case GGML_TYPE_Q3_K:
5965
+ dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5966
+ break;
5967
+ case GGML_TYPE_Q4_K:
5968
+ dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5969
+ break;
5970
+ case GGML_TYPE_Q5_K:
5971
+ dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5972
+ break;
5973
+ case GGML_TYPE_Q6_K:
5974
+ dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
5975
+ break;
5976
+ case GGML_TYPE_F16:
5977
+ convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
5978
+ break;
5979
+ default:
5980
+ GGML_ASSERT(false);
5981
+ break;
5982
+ }
5587
5983
 
5588
5984
  #ifdef GGML_CUDA_F16
5589
- if (src1_convert_f16) {
5590
- ggml_cuda_pool_free(src1_dfloat, ash);
5591
- }
5592
- #endif // GGML_CUDA_F16
5985
+ if (src1_convert_f16) {
5986
+ ggml_cuda_pool_free(src1_dfloat, ash);
5593
5987
  }
5988
+ #endif // GGML_CUDA_F16
5594
5989
 
5595
5990
  (void) src1;
5596
5991
  (void) dst;
5597
- (void) src0_ddf_i;
5598
- (void) i02;
5599
- (void) i1;
5992
+ (void) src1_ddq_i;
5993
+ (void) src1_ncols;
5994
+ (void) src1_padded_row_size;
5600
5995
  }
5601
5996
 
5602
5997
  inline void ggml_cuda_op_mul_mat_cublas(
5603
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5604
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5605
- cudaStream_t & cudaStream_main){
5998
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
5999
+ const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
6000
+ const int64_t src1_padded_row_size, const cudaStream_t & stream) {
5606
6001
 
5607
- GGML_ASSERT(src0_ddf_i != nullptr);
6002
+ GGML_ASSERT(src0_dd_i != nullptr);
5608
6003
  GGML_ASSERT(src1_ddf_i != nullptr);
5609
- GGML_ASSERT(dst_ddf_i != nullptr);
6004
+ GGML_ASSERT(dst_dd_i != nullptr);
5610
6005
 
5611
6006
  const float alpha = 1.0f;
5612
6007
  const float beta = 0.0f;
@@ -5614,43 +6009,54 @@ inline void ggml_cuda_op_mul_mat_cublas(
5614
6009
  const int64_t ne00 = src0->ne[0];
5615
6010
 
5616
6011
  const int64_t ne10 = src1->ne[0];
5617
- const int64_t ne11 = src1->ne[1];
5618
6012
 
5619
6013
  const int64_t ne0 = dst->ne[0];
5620
- const int64_t i01_diff = i01_high - i01_low;
6014
+ const int64_t row_diff = row_high - row_low;
6015
+
6016
+ float * src0_ddq_as_f32;
6017
+ size_t src0_as = 0;
6018
+
6019
+ if (src0->type != GGML_TYPE_F32) {
6020
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
6021
+ src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
6022
+ to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
6023
+ }
6024
+ const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
5621
6025
 
5622
6026
  int id;
5623
6027
  CUDA_CHECK(cudaGetDevice(&id));
5624
6028
 
5625
6029
  // the main device has a larger memory buffer to hold the results from all GPUs
5626
6030
  // ldc == nrows of the matrix that cuBLAS writes into
5627
- int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
6031
+ int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
5628
6032
 
5629
- CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], cudaStream_main));
6033
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
5630
6034
  CUBLAS_CHECK(
5631
6035
  cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
5632
- i01_diff, ne11, ne10,
6036
+ row_diff, src1_ncols, ne10,
5633
6037
  &alpha, src0_ddf_i, ne00,
5634
- src1_ddf_i, ne10,
5635
- &beta, dst_ddf_i, ldc));
6038
+ src1_ddf_i, ne10,
6039
+ &beta, dst_dd_i, ldc));
6040
+
6041
+ if (src0_as > 0) {
6042
+ ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
6043
+ }
5636
6044
 
5637
6045
  (void) dst;
5638
- (void) src0_ddq_i;
5639
- (void) i02;
5640
- (void) i1;
6046
+ (void) src1_ddq_i;
6047
+ (void) src1_padded_row_size;
5641
6048
  }
5642
6049
 
5643
6050
  inline void ggml_cuda_op_rope(
5644
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5645
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5646
- cudaStream_t & cudaStream_main){
6051
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6052
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5647
6053
 
5648
- GGML_ASSERT(src0_ddf_i != nullptr);
5649
- GGML_ASSERT(dst_ddf_i != nullptr);
6054
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6055
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5650
6056
 
5651
6057
  const int64_t ne00 = src0->ne[0];
5652
6058
  const int64_t ne01 = src0->ne[1];
5653
- const int64_t i01_diff = i01_high - i01_low;
6059
+ const int64_t nrows = ggml_nrows(src0);
5654
6060
 
5655
6061
  const int n_past = ((int32_t *) dst->op_params)[0];
5656
6062
  const int n_dims = ((int32_t *) dst->op_params)[1];
@@ -5663,44 +6069,37 @@ inline void ggml_cuda_op_rope(
5663
6069
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
5664
6070
 
5665
6071
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
6072
+ const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5666
6073
 
5667
6074
  const bool is_neox = mode & 2;
5668
6075
  const bool is_glm = mode & 4;
5669
6076
 
5670
6077
  // compute
5671
6078
  if (is_glm) {
5672
- const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
5673
- const float id_p = min(p, n_ctx - 2.f);
5674
- const float block_p = max(p - (n_ctx - 2.f), 0.f);
5675
- rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
6079
+ rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream);
5676
6080
  } else if (is_neox) {
5677
6081
  GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
5678
- const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5679
- rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
6082
+ rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
5680
6083
  } else {
5681
- const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5682
- rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
6084
+ rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
5683
6085
  }
5684
6086
 
5685
6087
  (void) src1;
5686
6088
  (void) dst;
5687
- (void) src0_ddq_i;
5688
- (void) src1_ddf_i;
5689
- (void) i1;
6089
+ (void) src1_dd;
5690
6090
  }
5691
6091
 
5692
6092
  inline void ggml_cuda_op_alibi(
5693
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5694
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5695
- cudaStream_t & cudaStream_main){
6093
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6094
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5696
6095
 
5697
- GGML_ASSERT(src0_ddf_i != nullptr);
5698
- GGML_ASSERT(dst_ddf_i != nullptr);
6096
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6097
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5699
6098
 
5700
6099
  const int64_t ne00 = src0->ne[0];
5701
6100
  const int64_t ne01 = src0->ne[1];
5702
6101
  const int64_t ne02 = src0->ne[2];
5703
- const int64_t i01_diff = i01_high - i01_low;
6102
+ const int64_t nrows = ggml_nrows(src0);
5704
6103
 
5705
6104
  const int n_past = ((int32_t *) dst->op_params)[0];
5706
6105
  const int n_head = ((int32_t *) dst->op_params)[1];
@@ -5715,334 +6114,354 @@ inline void ggml_cuda_op_alibi(
5715
6114
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
5716
6115
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
5717
6116
 
5718
- // compute
5719
- alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
6117
+ alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
5720
6118
 
5721
6119
  (void) src1;
5722
- (void) src0_ddq_i;
5723
- (void) src1_ddf_i;
5724
- (void) i1;
6120
+ (void) src1_dd;
5725
6121
  }
5726
6122
 
5727
6123
  inline void ggml_cuda_op_diag_mask_inf(
5728
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5729
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5730
- cudaStream_t & cudaStream_main){
6124
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6125
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5731
6126
 
5732
- GGML_ASSERT(src0_ddf_i != nullptr);
5733
- GGML_ASSERT(dst_ddf_i != nullptr);
6127
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6128
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5734
6129
 
5735
6130
  const int64_t ne00 = src0->ne[0];
5736
6131
  const int64_t ne01 = src0->ne[1];
5737
- const int64_t i01_diff = i01_high - i01_low;
6132
+ const int nrows0 = ggml_nrows(src0);
5738
6133
 
5739
6134
  const int n_past = ((int32_t *) dst->op_params)[0];
5740
6135
 
5741
- // compute
5742
- diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
6136
+ diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
5743
6137
 
5744
6138
  (void) src1;
5745
6139
  (void) dst;
5746
- (void) src0_ddq_i;
5747
- (void) src1_ddf_i;
5748
- (void) i02;
5749
- (void) i1;
6140
+ (void) src1_dd;
5750
6141
  }
5751
6142
 
5752
6143
  inline void ggml_cuda_op_soft_max(
5753
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5754
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5755
- cudaStream_t & cudaStream_main){
6144
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6145
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5756
6146
 
5757
- GGML_ASSERT(src0_ddf_i != nullptr);
5758
- GGML_ASSERT(dst_ddf_i != nullptr);
6147
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6148
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5759
6149
 
5760
6150
  const int64_t ne00 = src0->ne[0];
5761
- const int64_t i01_diff = i01_high - i01_low;
6151
+ const int64_t nrows = ggml_nrows(src0);
5762
6152
 
5763
- // compute
5764
- soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
6153
+ soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
5765
6154
 
5766
6155
  (void) src1;
5767
6156
  (void) dst;
5768
- (void) src0_ddq_i;
5769
- (void) src1_ddf_i;
5770
- (void) i02;
5771
- (void) i1;
6157
+ (void) src1_dd;
5772
6158
  }
5773
6159
 
5774
6160
  inline void ggml_cuda_op_scale(
5775
- const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
5776
- float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
5777
- cudaStream_t & cudaStream_main){
6161
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6162
+ const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
5778
6163
 
5779
- GGML_ASSERT(src0_ddf_i != nullptr);
5780
- GGML_ASSERT(dst_ddf_i != nullptr);
6164
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
6165
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
6166
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
5781
6167
 
5782
6168
  const float scale = ((float *) src1->data)[0];
5783
6169
 
5784
- const int64_t ne00 = src0->ne[0];
5785
- const int64_t i01_diff = i01_high - i01_low;
5786
-
5787
- // compute
5788
- scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main);
6170
+ scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
5789
6171
  CUDA_CHECK(cudaGetLastError());
5790
6172
 
5791
6173
  (void) src1;
5792
6174
  (void) dst;
5793
- (void) src0_ddq_i;
5794
- (void) src1_ddf_i;
5795
- (void) i02;
5796
- (void) i1;
6175
+ (void) src1_dd;
6176
+ }
6177
+
6178
+ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
6179
+ const int64_t nrows0 = ggml_nrows(src0);
6180
+
6181
+ const bool use_src1 = src1 != nullptr;
6182
+ const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
6183
+
6184
+ GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6185
+ GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
6186
+
6187
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6188
+ struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
6189
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6190
+
6191
+ const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
6192
+ const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
6193
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
6194
+
6195
+ const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
6196
+
6197
+ // dd = data device
6198
+ float * src0_ddf = nullptr;
6199
+ float * src1_ddf = nullptr;
6200
+ float * dst_ddf = nullptr;
6201
+
6202
+ // as = actual size
6203
+ size_t src0_asf = 0;
6204
+ size_t src1_asf = 0;
6205
+ size_t dst_asf = 0;
6206
+
6207
+ ggml_cuda_set_device(g_main_device);
6208
+ const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6209
+
6210
+ if (src0_on_device) {
6211
+ src0_ddf = (float *) src0_extra->data_device[g_main_device];
6212
+ } else {
6213
+ src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
6214
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
6215
+ }
6216
+
6217
+ if (use_src1 && !src1_stays_on_host) {
6218
+ if (src1_on_device) {
6219
+ src1_ddf = (float *) src1_extra->data_device[g_main_device];
6220
+ } else {
6221
+ src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
6222
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
6223
+ }
6224
+ }
6225
+ if (dst_on_device) {
6226
+ dst_ddf = (float *) dst_extra->data_device[g_main_device];
6227
+ } else {
6228
+ dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
6229
+ }
6230
+
6231
+ // do the computation
6232
+ op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
6233
+ CUDA_CHECK(cudaGetLastError());
6234
+
6235
+ // copy dst to host if necessary
6236
+ if (!dst_on_device) {
6237
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
6238
+ }
6239
+
6240
+ if (src0_asf > 0) {
6241
+ ggml_cuda_pool_free(src0_ddf, src0_asf);
6242
+ }
6243
+ if (src1_asf > 0) {
6244
+ ggml_cuda_pool_free(src1_ddf, src1_asf);
6245
+ }
6246
+ if (dst_asf > 0) {
6247
+ ggml_cuda_pool_free(dst_ddf, dst_asf);
6248
+ }
6249
+
6250
+ if (dst->backend == GGML_BACKEND_CPU) {
6251
+ CUDA_CHECK(cudaDeviceSynchronize());
6252
+ }
5797
6253
  }
5798
6254
 
5799
- static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
5800
- ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
6255
+ static void ggml_cuda_op_mul_mat(
6256
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
6257
+ const bool convert_src1_to_q8_1) {
6258
+
5801
6259
  const int64_t ne00 = src0->ne[0];
5802
6260
  const int64_t ne01 = src0->ne[1];
5803
6261
  const int64_t ne02 = src0->ne[2];
5804
6262
  const int64_t ne03 = src0->ne[3];
5805
6263
  const int64_t nrows0 = ggml_nrows(src0);
5806
6264
 
5807
- const bool use_src1 = src1 != nullptr;
5808
- const int64_t ne10 = use_src1 ? src1->ne[0] : 1;
5809
- const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
5810
- const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
5811
- const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
5812
- const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
6265
+ const int64_t ne10 = src1->ne[0];
6266
+ const int64_t ne11 = src1->ne[1];
6267
+ const int64_t ne12 = src1->ne[2];
6268
+ const int64_t ne13 = src1->ne[3];
6269
+ const int64_t nrows1 = ggml_nrows(src1);
5813
6270
 
5814
6271
  GGML_ASSERT(ne03 == ne13);
5815
6272
 
5816
6273
  const int64_t ne0 = dst->ne[0];
5817
6274
  const int64_t ne1 = dst->ne[1];
5818
6275
 
5819
- const int nb2 = dst->nb[2];
5820
- const int nb3 = dst->nb[3];
6276
+ const int nb2 = dst->nb[2];
6277
+ const int nb3 = dst->nb[3];
5821
6278
 
5822
6279
  GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
5823
- GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
6280
+ GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
5824
6281
 
5825
- // strides for iteration over dims 3 and 2
5826
- const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
5827
- const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
5828
- const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
5829
- const int64_t src0_stride = ne00 * ne01 * stride_mod;
5830
- const int64_t src1_stride = ne10 * ne11 * stride_mod;
5831
- const int64_t dst_stride = ne0 * ne1 * stride_mod;
6282
+ GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
5832
6283
 
5833
- const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
5834
- const int64_t i03_max = flatten_rows ? 1 : ne03;
5835
- const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
5836
- const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
5837
- GGML_ASSERT(!(flatten_rows && ne02 < ne12));
6284
+ const int64_t i02_divisor = ne12 / ne02;
5838
6285
 
5839
6286
  const size_t src0_ts = ggml_type_size(src0->type);
5840
6287
  const size_t src0_bs = ggml_blck_size(src0->type);
6288
+ const size_t q8_1_ts = sizeof(block_q8_1);
6289
+ const size_t q8_1_bs = QK8_1;
5841
6290
 
5842
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
5843
- struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
5844
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6291
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6292
+ struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
6293
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
5845
6294
 
5846
6295
  const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
5847
6296
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
5848
- const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
5849
6297
 
5850
- const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
5851
- const bool src1_stays_on_host = use_src1 && (
5852
- dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
6298
+ const bool src1_is_contiguous = ggml_is_contiguous(src1);
6299
+ const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ?
6300
+ ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
5853
6301
 
5854
6302
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
6303
+ GGML_ASSERT(!(split && ne02 > 1));
6304
+ GGML_ASSERT(!(split && ne03 > 1));
5855
6305
  GGML_ASSERT(!(split && ne02 < ne12));
5856
6306
 
5857
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
5858
-
5859
6307
  // dd = data device
5860
- char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
5861
- float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
5862
- float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
5863
- float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
5864
-
5865
- // asq = actual size quantized, asf = actual size float
5866
- size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
5867
- size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
6308
+ char * src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
6309
+ float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
6310
+ char * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
6311
+ float * dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
6312
+
6313
+ // as = actual size
6314
+ size_t src0_as[GGML_CUDA_MAX_DEVICES] = {0};
5868
6315
  size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
5869
- size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
6316
+ size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
6317
+ size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0};
5870
6318
 
5871
- // if multiple devices are used they need to wait for the main device
5872
- // here an event is recorded that signifies that the main device has finished calculating the input data
5873
- if (split && g_device_count > 1) {
5874
- CUDA_CHECK(cudaSetDevice(g_main_device));
5875
- CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
5876
- }
6319
+ int64_t row_low[GGML_CUDA_MAX_DEVICES];
6320
+ int64_t row_high[GGML_CUDA_MAX_DEVICES];
5877
6321
 
5878
- for (int id = 0; id < g_device_count; ++id) {
5879
- if (!split && id != g_main_device) {
5880
- continue;
5881
- }
5882
-
5883
- const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device;
5884
- const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
6322
+ for (int64_t id = 0; id < g_device_count; ++id) {
6323
+ // by default, use all rows
6324
+ row_low[id] = 0;
6325
+ row_high[id] = ne01;
5885
6326
 
5886
- int64_t row_low, row_high;
6327
+ // for multi GPU, get the row boundaries from tensor split
6328
+ // and round to mul_mat_q tile sizes
5887
6329
  if (split) {
5888
6330
  const int64_t rounding = get_row_rounding(src0->type);
5889
6331
 
5890
- row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
5891
- row_low -= row_low % rounding;
6332
+ if (id != 0) {
6333
+ row_low[id] = ne01*g_tensor_split[id];
6334
+ row_low[id] -= row_low[id] % rounding;
6335
+ }
5892
6336
 
5893
- if (id == g_device_count - 1) {
5894
- row_high = nrows0;
5895
- } else {
5896
- row_high = nrows0*g_tensor_split[id + 1];
5897
- row_high -= row_high % rounding;
6337
+ if (id != g_device_count - 1) {
6338
+ row_high[id] = ne01*g_tensor_split[id + 1];
6339
+ row_high[id] -= row_high[id] % rounding;
5898
6340
  }
5899
- } else {
5900
- row_low = 0;
5901
- row_high = nrows0*i02_divisor;
5902
6341
  }
5903
- if (row_low == row_high) {
6342
+ }
6343
+
6344
+ for (int64_t id = 0; id < g_device_count; ++id) {
6345
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
5904
6346
  continue;
5905
6347
  }
5906
6348
 
5907
- int64_t row_diff = row_high - row_low;
5908
-
5909
- cudaSetDevice(id);
5910
- cudaStream_t cudaStream_main = g_cudaStreams_main[id];
6349
+ const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
6350
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
5911
6351
 
5912
- // wait for main GPU data if necessary
5913
- if (split && id != g_main_device) {
5914
- CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
5915
- }
6352
+ ggml_cuda_set_device(id);
6353
+ const cudaStream_t stream = g_cudaStreams[id][0];
5916
6354
 
5917
6355
  if (src0_on_device && src0_is_contiguous) {
5918
- if (src0_is_f32) {
5919
- src0_ddf[id] = (float *) src0_extra->data_device[id];
5920
- } else {
5921
- src0_ddq[id] = (char *) src0_extra->data_device[id];
5922
- }
6356
+ src0_dd[id] = (char *) src0_extra->data_device[id];
5923
6357
  } else {
5924
- if (src0_is_f32) {
5925
- src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
5926
- } else {
5927
- src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
5928
- }
6358
+ const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
6359
+ src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
5929
6360
  }
5930
6361
 
5931
- if (src0_needs_f32 && !src0_is_f32) {
5932
- src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
6362
+ if (src1_on_device && src1_is_contiguous) {
6363
+ src1_ddf[id] = (float *) src1_extra->data_device[id];
6364
+ } else {
6365
+ src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
5933
6366
  }
5934
6367
 
5935
- if (use_src1 && !src1_stays_on_host) {
5936
- if (src1_on_device && src1_is_contiguous) {
5937
- src1_ddf[id] = (float *) src1_extra->data_device[id];
5938
- } else {
5939
- src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
6368
+ if (convert_src1_to_q8_1) {
6369
+ src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
6370
+
6371
+ if (split && src1_on_device && src1_is_contiguous) {
6372
+ quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
6373
+ CUDA_CHECK(cudaGetLastError());
5940
6374
  }
5941
6375
  }
6376
+
5942
6377
  if (dst_on_device) {
5943
- dst_ddf[id] = (float *) dst_extra->data_device[id];
6378
+ dst_dd[id] = (float *) dst_extra->data_device[id];
5944
6379
  } else {
5945
- size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float);
5946
- dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
6380
+ const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
6381
+ dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
5947
6382
  }
6383
+ }
5948
6384
 
5949
- for (int64_t i03 = 0; i03 < i03_max; i03++) {
5950
- const int64_t i13 = i03 % ne13;
5951
- for (int64_t i02 = 0; i02 < i02_max; i02++) {
5952
- const int64_t i12 = i02 % ne12;
6385
+ // if multiple devices are used they need to wait for the main device
6386
+ // here an event is recorded that signals that the main device has finished calculating the input data
6387
+ if (split && g_device_count > 1) {
6388
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6389
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
6390
+ }
5953
6391
 
5954
- const int64_t i0 = i03*i02_max + i02;
6392
+ const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
6393
+ for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
6394
+ const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
6395
+ const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
5955
6396
 
5956
- // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
5957
- const int64_t i0_offset_low = row_low/rows_per_iter;
5958
- const int64_t i0_offset_high = row_high/rows_per_iter;
6397
+ for (int64_t id = 0; id < g_device_count; ++id) {
6398
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
6399
+ continue;
6400
+ }
5959
6401
 
5960
- int64_t i01_low = 0;
5961
- int64_t i01_high = rows_per_iter;
5962
- if (split) {
5963
- if (i0 < i0_offset_low || i0 > i0_offset_high) {
5964
- continue;
5965
- }
5966
- if (i0 == i0_offset_low) {
5967
- i01_low = row_low % rows_per_iter;
5968
- }
5969
- if (i0 == i0_offset_high) {
5970
- i01_high = row_high % rows_per_iter;
5971
- }
5972
- }
6402
+ const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
6403
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
6404
+ const int64_t row_diff = row_high[id] - row_low[id];
5973
6405
 
5974
- // There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or optimizing out local variables.
5975
- // Removing the first assert or changing the order of the arguments causes the second assert to fail.
5976
- // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
5977
- // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
5978
- GGML_ASSERT(i01_low == 0 || g_device_count > 1);
5979
- GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
6406
+ ggml_cuda_set_device(id);
6407
+ const cudaStream_t stream = g_cudaStreams[id][is];
5980
6408
 
5981
- const int64_t i01_diff = i01_high - i01_low;
5982
- if (i01_diff == 0) {
5983
- continue;
5984
- }
5985
- const int64_t i11 = i13*ne12 + i12;
6409
+ // wait for main GPU data if necessary
6410
+ if (split && (id != g_main_device || is != 0)) {
6411
+ CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
6412
+ }
6413
+
6414
+ for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
6415
+ const int64_t i03 = i0 / ne12;
6416
+ const int64_t i02 = i0 % ne12;
6417
+
6418
+ const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
5986
6419
 
5987
6420
  // for split tensors the data begins at i0 == i0_offset_low
5988
- char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
5989
- float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
5990
- float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
5991
- float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
5992
-
5993
- // for split tensors the data pointer needs to be rounded down
5994
- // to the bin edge for i03, i02 bins beyond the first
5995
- if (i0 - i0_offset_low > 0) {
5996
- GGML_ASSERT(!flatten_rows);
5997
- src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
5998
- src0_ddf_i -= (row_low % ne01)*ne00;
5999
- dst_ddf_i -= (row_low % ne0)*ne1;
6000
- }
6421
+ char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
6422
+ float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
6423
+ char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
6424
+ float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
6001
6425
 
6002
6426
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
6003
6427
  // in that case an offset on dst_ddf_i is needed
6004
6428
  if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
6005
- dst_ddf_i += i01_low; // offset is 0 if no tensor split
6429
+ dst_dd_i += row_low[id]; // offset is 0 if no tensor split
6006
6430
  }
6007
6431
 
6008
6432
  // copy src0, src1 to device if necessary
6009
- if (use_src1 && !src1_stays_on_host) {
6010
- if (src1->backend == GGML_BACKEND_CPU) {
6011
- GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
6012
- int64_t nrows1 = flatten_rows ? nrows0 : ne11;
6013
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
6014
- } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
6015
- if (id != g_main_device) {
6016
- GGML_ASSERT(!flatten_rows);
6433
+ if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
6434
+ if (id != g_main_device) {
6435
+ if (convert_src1_to_q8_1) {
6436
+ char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
6437
+ CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs,
6438
+ cudaMemcpyDeviceToDevice, stream));
6439
+ } else {
6017
6440
  float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
6018
- src1_ddf_i_source += i11*src1_stride;
6019
- CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
6020
- cudaMemcpyDeviceToDevice, cudaStream_main));
6441
+ src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
6442
+ CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float),
6443
+ cudaMemcpyDeviceToDevice, stream));
6021
6444
  }
6022
- } else if (src1_on_device && !src1_is_contiguous) {
6023
- GGML_ASSERT(!split);
6024
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
6025
- } else {
6026
- GGML_ASSERT(false);
6027
6445
  }
6446
+ } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
6447
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
6448
+ src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
6449
+ } else {
6450
+ GGML_ASSERT(false);
6028
6451
  }
6029
6452
 
6030
- if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
6031
- if (src0_is_f32) {
6032
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
6033
- } else {
6034
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
6035
- }
6453
+ if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
6454
+ quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
6455
+ CUDA_CHECK(cudaGetLastError());
6036
6456
  }
6037
6457
 
6038
- // convert src0 to f32 if it is necessary for the ggml_cuda_op
6039
- if (src0_needs_f32 && !src0_is_f32) {
6040
- to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
6041
- CUDA_CHECK(cudaGetLastError());
6458
+ if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
6459
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
6042
6460
  }
6043
6461
 
6044
6462
  // do the computation
6045
- op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
6463
+ op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
6464
+ row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
6046
6465
  CUDA_CHECK(cudaGetLastError());
6047
6466
 
6048
6467
  // copy dst to host or other device if necessary
@@ -6064,95 +6483,86 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
6064
6483
  // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
6065
6484
  // Instead they need to be copied to the correct slice in ne0 = dst row index.
6066
6485
  // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
6067
- float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
6068
- CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
6069
- i01_diff*sizeof(float), ne1, kind, cudaStream_main));
6486
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
6487
+ GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
6488
+ dhf_dst_i += src1_col_0*ne0 + row_low[id];
6489
+ CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float),
6490
+ row_diff*sizeof(float), src1_ncols, kind, stream));
6070
6491
  } else {
6071
6492
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
6072
- CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
6493
+ GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
6494
+ dhf_dst_i += src1_col_0*ne0;
6495
+ CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
6073
6496
  }
6074
6497
  }
6075
6498
 
6076
- // signify to main device that other device is done
6077
- if (split && g_device_count > 1 && id != g_main_device) {
6078
- CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
6499
+ // add event for the main device to wait on until other device is done
6500
+ if (split && (id != g_main_device || is != 0)) {
6501
+ CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
6079
6502
  }
6080
6503
  }
6081
6504
  }
6082
6505
  }
6083
6506
 
6084
- // wait until each device is finished, then free their buffers
6085
- for (int id = 0; id < g_device_count; ++id) {
6086
- if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
6087
- continue;
6088
- }
6089
-
6090
- CUDA_CHECK(cudaSetDevice(id));
6507
+ for (int64_t id = 0; id < g_device_count; ++id) {
6508
+ CUDA_CHECK(ggml_cuda_set_device(id));
6091
6509
 
6092
- if (src0_asq[id] > 0) {
6093
- ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
6094
- }
6095
- if (src0_asf[id] > 0) {
6096
- ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
6510
+ // free buffers again when done
6511
+ if (src0_as[id] > 0) {
6512
+ ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
6097
6513
  }
6098
6514
  if (src1_asf[id] > 0) {
6099
6515
  ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
6100
6516
  }
6101
- if (dst_asf[id] > 0) {
6102
- ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
6517
+ if (src1_asq[id] > 0) {
6518
+ ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
6519
+ }
6520
+ if (dst_as[id] > 0) {
6521
+ ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
6103
6522
  }
6104
6523
  }
6105
6524
 
6106
6525
  // main device waits for all other devices to be finished
6107
6526
  if (split && g_device_count > 1) {
6108
- CUDA_CHECK(cudaSetDevice(g_main_device));
6109
- for (int id = 0; id < g_device_count; ++id) {
6110
- if (id != g_main_device && src0_extra->events[id]) {
6111
- CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
6527
+ int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
6528
+ is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
6529
+
6530
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6531
+ for (int64_t id = 0; id < g_device_count; ++id) {
6532
+ for (int64_t is = 0; is < is_max; ++is) {
6533
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
6112
6534
  }
6113
6535
  }
6114
6536
  }
6115
6537
 
6116
6538
  if (dst->backend == GGML_BACKEND_CPU) {
6117
- CUDA_CHECK(cudaSetDevice(g_main_device));
6539
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6118
6540
  CUDA_CHECK(cudaDeviceSynchronize());
6119
6541
  }
6120
6542
  }
6121
6543
 
6122
6544
  void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6123
- // ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
6124
- // Due to flatten_rows == true this does in practice not make a difference however.
6125
- // Better solution would be nice but right now that would require disproportionate changes.
6126
- GGML_ASSERT(
6127
- (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
6128
- src1->type == GGML_TYPE_F32 &&
6129
- (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
6130
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
6545
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
6131
6546
  }
6132
6547
 
6133
6548
  void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6134
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6135
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
6549
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
6136
6550
  }
6137
6551
 
6138
6552
  void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6139
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6140
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
6553
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
6141
6554
  }
6142
6555
 
6143
6556
  void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6144
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6145
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
6557
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
6146
6558
  }
6147
6559
 
6148
6560
  void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6149
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6150
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
6561
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
6151
6562
  }
6152
6563
 
6153
6564
  void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6154
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6155
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
6565
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
6156
6566
  }
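
All of the element-wise operations above now route through a single `ggml_cuda_op_flatten` entry point instead of the old `ggml_cuda_op` call with per-op flags, so each public wrapper shrinks to one line and the type assertions move into the shared path. A minimal sketch of that function-pointer dispatch shape; the typedef and parameter list here are illustrative guesses, not the actual declarations in ggml-cuda.cu:

```cpp
#include <cuda_runtime.h>

struct ggml_tensor;  // defined in ggml.h; opaque for this sketch

// illustrative callback type for one flattened element-wise op
typedef void (*cuda_op_t)(const ggml_tensor * src0, const ggml_tensor * src1,
                          ggml_tensor * dst, cudaStream_t stream);

// the shared driver owns what the old per-op calls configured via flags:
// device selection, making operands resident, type checks, and the copy back
// to the host when dst lives on the CPU; only the kernel launch differs per op
static void op_flatten(const ggml_tensor * src0, const ggml_tensor * src1,
                       ggml_tensor * dst, cuda_op_t op) {
    cudaStream_t stream = nullptr;   // common setup elided in this sketch
    op(src0, src1, dst, stream);     // the single op-specific step
}

static void op_add_stub(const ggml_tensor *, const ggml_tensor *,
                        ggml_tensor *, cudaStream_t) { /* kernel launch here */ }

// a public wrapper then reduces to a single call, as in the hunks above
void cuda_add_example(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    op_flatten(src0, src1, dst, op_add_stub);
}
```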
6157
6567
 
6158
6568
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
@@ -6186,8 +6596,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
6186
6596
 
6187
6597
  const int64_t ne12 = src1->ne[2];
6188
6598
 
6189
- CUDA_CHECK(cudaSetDevice(g_main_device));
6190
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
6599
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6600
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6191
6601
 
6192
6602
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6193
6603
  void * src0_ddq = src0_extra->data_device[g_main_device];
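
In this hunk (and repeatedly below) `cudaSetDevice(...)` becomes `CUDA_CHECK(ggml_cuda_set_device(...))`, i.e. device selection now goes through a helper whose result can itself be error-checked. One plausible shape for such a wrapper, assuming it skips the driver call when the requested device is already current; this is a guess at the intent, not the actual implementation:

```cpp
#include <cuda_runtime.h>

// set the current CUDA device, avoiding the redundant call when the requested
// device is already active; always return a cudaError_t so that the call site
// can wrap it in CUDA_CHECK(...)
static cudaError_t example_set_device(int device) {
    int current = -1;
    cudaError_t err = cudaGetDevice(&current);
    if (err != cudaSuccess) {
        return err;
    }
    if (current == device) {
        return cudaSuccess;
    }
    return cudaSetDevice(device);
}
```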
@@ -6198,7 +6608,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
6198
6608
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6199
6609
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6200
6610
 
6201
- ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
6611
+ ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
6202
6612
  }
6203
6613
 
6204
6614
  void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -6217,8 +6627,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
6217
6627
  const int64_t nb01 = src0->nb[1];
6218
6628
  const int64_t nb02 = src0->nb[2];
6219
6629
 
6220
- CUDA_CHECK(cudaSetDevice(g_main_device));
6221
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
6630
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6631
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6222
6632
 
6223
6633
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6224
6634
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -6229,38 +6639,49 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
6229
6639
  struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
6230
6640
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
6231
6641
 
6232
- const int row_stride_x = nb01 / sizeof(half);
6233
- const int channel_stride_x = nb02 / sizeof(half);
6642
+ const int64_t row_stride_x = nb01 / sizeof(half);
6643
+ const int64_t channel_stride_x = nb02 / sizeof(half);
6234
6644
 
6235
- ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
6645
+ ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
6236
6646
  }
6237
6647
 
6238
6648
  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6239
6649
  bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
6240
6650
  src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
6241
6651
 
6652
+ int64_t min_compute_capability = INT_MAX;
6653
+ for (int64_t id = 0; id < g_device_count; ++id) {
6654
+ if (min_compute_capability > g_compute_capabilities[id]
6655
+ && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
6656
+ min_compute_capability = g_compute_capabilities[id];
6657
+ }
6658
+ }
6659
+
6242
6660
  if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
6243
6661
  ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
6244
6662
  } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
6245
6663
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
6246
6664
  }else if (src0->type == GGML_TYPE_F32) {
6247
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
6665
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
6248
6666
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
6249
6667
  if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
6250
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
6251
- } else {
6252
- int min_compute_capability = INT_MAX;
6253
- for (int id = 0; id < g_device_count; ++id) {
6254
- if (min_compute_capability > g_compute_capabilities[id]
6255
- && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
6256
- min_compute_capability = g_compute_capabilities[id];
6257
- }
6258
- }
6259
6668
 
6669
+ #ifdef GGML_CUDA_FORCE_DMMV
6670
+ const bool use_mul_mat_vec_q = false;
6671
+ #else
6672
+ const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
6673
+ #endif // GGML_CUDA_FORCE_DMMV
6674
+
6675
+ if (use_mul_mat_vec_q) {
6676
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
6677
+ } else {
6678
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
6679
+ }
6680
+ } else {
6260
6681
  if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
6261
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
6682
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
6262
6683
  } else {
6263
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
6684
+ ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
6264
6685
  }
6265
6686
  }
6266
6687
  } else {
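
The routing above can be condensed: the slowest GPU that actually receives a share of the split (its entry in `g_tensor_split` is strictly below the next one) determines `min_compute_capability`, and for quantized `src0` with a single `src1` column the new `mul_mat_vec_q` path is chosen whenever that capability allows `__dp4a`, unless `GGML_CUDA_FORCE_DMMV` overrides it. An illustrative condensation of that decision with simplified names; it omits the `GGML_CUDA_DMMV_X` divisibility check and the permuted/non-contiguous fast paths:

```cpp
#include <climits>

// minimum compute capability over devices that actually own a slice of the split
static int min_cc_of_split(const int * cc, const float * tensor_split, int n_devices) {
    int m = INT_MAX;
    for (int id = 0; id < n_devices; ++id) {
        const float next = id + 1 < n_devices ? tensor_split[id + 1] : 1.0f;
        if (tensor_split[id] < next && cc[id] < m) {
            m = cc[id];
        }
    }
    return m;
}

enum mm_path { MM_CUBLAS, MM_MUL_MAT_VEC_Q, MM_DEQUANT_MUL_MAT_VEC, MM_MUL_MAT_Q };

// condensed version of the dispatch above
static mm_path pick_path(bool src0_f32, bool src0_quantized, bool src1_is_vector,
                         int min_cc, bool force_dmmv, bool mul_mat_q_enabled) {
    const int kMinCcDp4a = 610;   // same threshold as MIN_CC_DP4A
    if (src0_f32) {
        return MM_CUBLAS;                       // plain f32 GEMM goes to cuBLAS
    }
    if (src1_is_vector) {
        // matrix-vector case: quantized dot-product kernel if __dp4a is usable
        const bool vec_q = !force_dmmv && src0_quantized && min_cc >= kMinCcDp4a;
        return vec_q ? MM_MUL_MAT_VEC_Q : MM_DEQUANT_MUL_MAT_VEC;
    }
    // matrix-matrix case: custom quantized kernels when enabled and supported
    if (mul_mat_q_enabled && src0_quantized && min_cc >= kMinCcDp4a) {
        return MM_MUL_MAT_Q;
    }
    return MM_CUBLAS;
}
```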
@@ -6269,8 +6690,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
6269
6690
  }
6270
6691
 
6271
6692
  void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6272
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6273
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true);
6693
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
6274
6694
  }
6275
6695
 
6276
6696
  void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6299,8 +6719,8 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
6299
6719
  const int64_t nb11 = src1->nb[1];
6300
6720
  const int64_t nb12 = src1->nb[2];
6301
6721
 
6302
- CUDA_CHECK(cudaSetDevice(g_main_device));
6303
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
6722
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6723
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
6304
6724
 
6305
6725
  const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
6306
6726
  const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -6310,10 +6730,10 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
6310
6730
 
6311
6731
  if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
6312
6732
  ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
6313
- ne10, ne11, nb10, nb11, nb12, cudaStream_main);
6733
+ ne10, ne11, nb10, nb11, nb12, main_stream);
6314
6734
  } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
6315
6735
  ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
6316
- ne10, ne11, nb10, nb11, nb12, cudaStream_main);
6736
+ ne10, ne11, nb10, nb11, nb12, main_stream);
6317
6737
  } else {
6318
6738
  GGML_ASSERT(false);
6319
6739
  }
@@ -6327,28 +6747,20 @@ void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
6327
6747
  }
6328
6748
 
6329
6749
  void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6330
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6331
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
6750
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
6332
6751
  }
6333
6752
 
6334
6753
  void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6335
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6336
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true);
6754
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
6337
6755
  }
6338
6756
 
6339
6757
  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6340
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6341
6758
  GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
6342
-
6343
- const int mode = ((int32_t *) dst->op_params)[2];
6344
- const bool is_glm = mode & 4;
6345
-
6346
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
6759
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
6347
6760
  }
6348
6761
 
6349
6762
  void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6350
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
6351
- ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
6763
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
6352
6764
  }
6353
6765
 
6354
6766
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6358,7 +6770,7 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
6358
6770
  }
6359
6771
 
6360
6772
  void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
6361
- int nrows = ggml_nrows(tensor);
6773
+ const int64_t nrows = ggml_nrows(tensor);
6362
6774
 
6363
6775
  const int64_t ne0 = tensor->ne[0];
6364
6776
 
@@ -6368,14 +6780,14 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
6368
6780
  struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
6369
6781
  memset(extra, 0, sizeof(*extra));
6370
6782
 
6371
- for (int id = 0; id < g_device_count; ++id) {
6783
+ for (int64_t id = 0; id < g_device_count; ++id) {
6372
6784
  if (backend == GGML_BACKEND_GPU && id != g_main_device) {
6373
6785
  continue;
6374
6786
  }
6375
6787
 
6376
- cudaSetDevice(id);
6788
+ ggml_cuda_set_device(id);
6377
6789
 
6378
- int row_low, row_high;
6790
+ int64_t row_low, row_high;
6379
6791
  if (backend == GGML_BACKEND_GPU) {
6380
6792
  row_low = 0;
6381
6793
  row_high = nrows;
@@ -6425,7 +6837,9 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
6425
6837
  extra->data_device[id] = buf;
6426
6838
 
6427
6839
  if (backend == GGML_BACKEND_GPU_SPLIT) {
6428
- CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
6840
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
6841
+ CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
6842
+ }
6429
6843
  }
6430
6844
  }
6431
6845
 
@@ -6439,15 +6853,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
6439
6853
 
6440
6854
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6441
6855
 
6442
- for (int id = 0; id < g_device_count; ++id) {
6856
+ for (int64_t id = 0; id < g_device_count; ++id) {
6443
6857
  if (extra->data_device[id] != nullptr) {
6444
- CUDA_CHECK(cudaSetDevice(id));
6858
+ CUDA_CHECK(ggml_cuda_set_device(id));
6445
6859
  CUDA_CHECK(cudaFree(extra->data_device[id]));
6446
6860
  }
6447
6861
 
6448
- if (extra->events[id] != nullptr) {
6449
- CUDA_CHECK(cudaSetDevice(id));
6450
- CUDA_CHECK(cudaEventDestroy(extra->events[id]));
6862
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
6863
+ if (extra->events[id][is] != nullptr) {
6864
+ CUDA_CHECK(ggml_cuda_set_device(id));
6865
+ CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
6866
+ }
6451
6867
  }
6452
6868
  }
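
A split tensor now carries a `MAX_STREAMS`-wide array of events per device, created with `cudaEventDisableTiming` (timing data is not needed for pure synchronization) and destroyed only for the slots that were actually created. A standalone sketch of that create/destroy lifecycle, with illustrative array sizes in place of the library's globals:

```cpp
#include <cuda_runtime.h>

#define N_DEVICES 2   // illustrative, not g_device_count / MAX_STREAMS
#define N_STREAMS 8

static cudaEvent_t events[N_DEVICES][N_STREAMS];   // zero-initialized: nullptr

// create one lightweight event per (device, stream) slot for a split tensor
static void create_events(void) {
    for (int id = 0; id < N_DEVICES; ++id) {
        cudaSetDevice(id);
        for (int is = 0; is < N_STREAMS; ++is) {
            // cudaEventDisableTiming makes the event cheaper: it can only be
            // used for synchronization, which is all the wait logic needs
            cudaEventCreateWithFlags(&events[id][is], cudaEventDisableTiming);
        }
    }
}

// destroy only the slots that were actually created; the nullptr check
// mirrors the guard added in ggml_cuda_free_data above
static void destroy_events(void) {
    for (int id = 0; id < N_DEVICES; ++id) {
        for (int is = 0; is < N_STREAMS; ++is) {
            if (events[id][is] != nullptr) {
                cudaSetDevice(id);
                cudaEventDestroy(events[id][is]);
                events[id][is] = nullptr;
            }
        }
    }
}
```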
6453
6869
 
@@ -6499,7 +6915,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
6499
6915
  force_inplace;
6500
6916
  const size_t size = ggml_nbytes(tensor);
6501
6917
 
6502
- CUDA_CHECK(cudaSetDevice(g_main_device));
6918
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
6503
6919
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
6504
6920
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
6505
6921
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];