llama_cpp 0.5.2 → 0.6.0

@@ -1,3 +1,4 @@
+ #include <algorithm>
  #include <cstddef>
  #include <cstdint>
  #include <limits>
@@ -14,9 +15,11 @@
  // for rocblas_initialize()
  #include "rocblas/rocblas.h"
  #endif // __HIP_PLATFORM_AMD__
+ #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
  #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
  #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
  #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+ #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
  #define CUBLAS_OP_N HIPBLAS_OP_N
  #define CUBLAS_OP_T HIPBLAS_OP_T
  #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
@@ -31,6 +34,9 @@
  #define cublasSetStream hipblasSetStream
  #define cublasSgemm hipblasSgemm
  #define cublasStatus_t hipblasStatus_t
+ #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+ #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+ #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
  #define cudaDeviceProp hipDeviceProp_t
  #define cudaDeviceSynchronize hipDeviceSynchronize
  #define cudaError_t hipError_t
@@ -61,7 +67,7 @@
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
  #define cudaStreamNonBlocking hipStreamNonBlocking
  #define cudaStreamSynchronize hipStreamSynchronize
- #define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+ #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
  #define cudaStream_t hipStream_t
  #define cudaSuccess hipSuccess
  #else
@@ -190,6 +196,12 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  } while (0)
  #endif // CUDART_VERSION >= 11

+ #if CUDART_VERSION >= 11100
+ #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+ #else
+ #define GGML_CUDA_ASSUME(x)
+ #endif // CUDART_VERSION >= 11100
+
  #ifdef GGML_CUDA_F16
  typedef half dfloat; // dequantize float
  typedef half2 dfloat2;
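Note: the new GGML_CUDA_ASSUME macro expands to __builtin_assume(x) on CUDA 11.1 or newer (CUDART_VERSION >= 11100) and to nothing otherwise; the mul_mat_q tile loaders further down switch to it so the range hints stay optional. A minimal sketch of the effect, assuming WARP_SIZE is the usual power-of-two warp width defined elsewhere in this file; wrap_lane is a hypothetical helper, not part of the diff:

// Illustrative only: the assumes give the optimizer the same range information
// that __builtin_assume did, but degrade to a no-op on older toolkits.
static __device__ __forceinline__ int wrap_lane(const int k) {
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k < WARP_SIZE);
    return k & (WARP_SIZE - 1); // with both assumes in range, this can fold to just k
}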
@@ -226,8 +238,12 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t *
  return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  }

+ template<typename T>
+ using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
+ typedef to_t_cuda_t<float> to_fp32_cuda_t;
+ typedef to_t_cuda_t<half> to_fp16_cuda_t;
+
  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
- typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
  typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
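The single to_fp32_cuda_t function-pointer typedef becomes a template alias here, so the same converter signature can target either float or half output. A standalone sketch of the pattern, assuming only the CUDA runtime and half-precision headers; the no-op converter and pick_fp16_converter are hypothetical:

#include <cuda_fp16.h>
#include <cuda_runtime.h>

template<typename T>
using to_t_cuda_t = void (*)(const void * x, T * y, int k, cudaStream_t stream);

typedef to_t_cuda_t<float> to_fp32_cuda_t; // same shape as the old typedef
typedef to_t_cuda_t<half>  to_fp16_cuda_t; // new: converters that emit fp16

// Hypothetical converter with the matching signature; the real lookup lives in
// ggml_get_to_fp32_cuda()/ggml_get_to_fp16_cuda() later in this diff.
static void noop_to_fp16(const void * /*x*/, half * /*y*/, int /*k*/, cudaStream_t /*stream*/) {}

static to_fp16_cuda_t pick_fp16_converter() {
    return noop_to_fp16;
}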
@@ -418,6 +434,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
  #endif

+ #ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
+ #define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
+ #endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
+
  #define MUL_MAT_SRC1_COL_STRIDE 128

  #define MAX_STREAMS 8
@@ -448,7 +468,7 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
  static bool g_mul_mat_q = true;

  static void * g_scratch_buffer = nullptr;
- static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
+ static size_t g_scratch_size = 0; // disabled by default
  static size_t g_scratch_offset = 0;

  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -1502,6 +1522,14 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
  v.y = x[ib + iqs + 1];
  }

+ static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
+ const float * x = (const float *) vx;
+
+ // automatic half -> float type cast if dfloat == float
+ v.x = x[ib + iqs + 0];
+ v.y = x[ib + iqs + 1];
+ }
+
  static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
  const int ix = blockDim.x*blockIdx.x + threadIdx.x;

@@ -1541,8 +1569,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
  reinterpret_cast<half&>(y[ib].ds.y) = sum;
  }

- template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
- static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
+ template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;

  if (i >= k) {
@@ -2145,10 +2173,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI4_0;
  const int kqsx = k % QI4_0;
@@ -2239,10 +2267,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI4_1;
  const int kqsx = k % QI4_1;
@@ -2331,10 +2359,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI5_0;
  const int kqsx = k % QI5_0;
@@ -2445,10 +2473,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI5_1;
  const int kqsx = k % QI5_1;
@@ -2551,10 +2579,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI8_0;
  const int kqsx = k % QI8_0;
@@ -2642,10 +2670,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI2_K;
  const int kqsx = k % QI2_K;
@@ -2763,10 +2791,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI3_K;
  const int kqsx = k % QI3_K;
@@ -2981,10 +3009,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI4_K; // == 0 if QK_K == 256
  const int kqsx = k % QI4_K; // == k if QK_K == 256
@@ -3162,10 +3190,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI5_K; // == 0 if QK_K == 256
  const int kqsx = k % QI5_K; // == k if QK_K == 256
@@ -3291,10 +3319,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI6_K; // == 0 if QK_K == 256
  const int kqsx = k % QI6_K; // == k if QK_K == 256
@@ -4342,8 +4370,10 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
  }

  // rope == RoPE == rotary positional embedding
- static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
- const float p_delta, const int p_delta_rows, const float theta_scale) {
+
+ template<typename T, bool has_pos>
+ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
+ const int p_delta_rows, const float theta_scale) {
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

  if (col >= ncols) {
@@ -4352,8 +4382,11 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c

  const int row = blockDim.x*blockIdx.x + threadIdx.x;
  const int i = row*ncols + col;
+ const int i2 = row/p_delta_rows;

- const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+ const int p = has_pos ? pos[i2] : 0;
+ const float p0 = p*freq_scale;
+ const float theta = p0*powf(theta_scale, col/2);
  const float sin_theta = sinf(theta);
  const float cos_theta = cosf(theta);

@@ -4364,8 +4397,9 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
  dst[i + 1] = x0*sin_theta + x1*cos_theta;
  }

- static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
- const float p_delta, const int p_delta_rows, const float theta_scale) {
+ template<typename T, bool has_pos>
+ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
+ const int p_delta_rows, const float theta_scale) {
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

  if (col >= ncols) {
@@ -4374,8 +4408,11 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco

  const int row = blockDim.x*blockIdx.x + threadIdx.x;
  const int i = row*ncols + col/2;
+ const int i2 = row/p_delta_rows;

- const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+ const int p = has_pos ? pos[i2] : 0;
+ const float p0 = p*freq_scale;
+ const float theta = p0*powf(theta_scale, col/2);
  const float sin_theta = sinf(theta);
  const float cos_theta = cosf(theta);

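With the template change above, the rotation angle no longer comes from a scalar p0/p_delta pair: each row reads its own position from pos, and the angle for a column pair is pos[i2] * freq_scale * theta_scale^(col/2). A small host-side reference of that arithmetic, assuming theta_scale = powf(freq_base, -2.0f/n_dims) as computed later in ggml_cuda_op_rope; rope_pair_ref is an illustrative name, not part of the file:

#include <cmath>
#include <utility>

// Reference rotation for one (x0, x1) pair, mirroring the formula used by the
// templated rope kernel above (col/2 is intentionally integer division).
static std::pair<float, float> rope_pair_ref(float x0, float x1, int pos, float freq_scale,
                                             float theta_scale, int col) {
    const float p0    = pos * freq_scale;
    const float theta = p0 * std::pow(theta_scale, col/2);
    const float sin_theta = std::sin(theta);
    const float cos_theta = std::cos(theta);
    return { x0*cos_theta - x1*sin_theta, x0*sin_theta + x1*cos_theta };
}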
@@ -4386,8 +4423,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
  dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
  }

- static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
- const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
+ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
+ const int p_delta_rows, const float theta_scale, const int n_ctx) {
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
  const int half_n_dims = ncols/4;

@@ -4397,11 +4434,13 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol

  const int row = blockDim.y*blockIdx.y + threadIdx.y;
  const int i = row*ncols + col;
+ const int i2 = row/p_delta_rows;

  const float col_theta_scale = powf(theta_scale, col);
- const float p = p0 + p_delta*(row/p_delta_rows);
+ // FIXME: this is likely wrong
+ const int p = pos != nullptr ? pos[i2] : 0;

- const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
+ const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
  const float sin_theta = sinf(theta);
  const float cos_theta = cosf(theta);

@@ -4411,7 +4450,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
  dst[i + 0] = x0*cos_theta - x1*sin_theta;
  dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

- const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
+ const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
  const float sin_block_theta = sinf(block_theta);
  const float cos_block_theta = cosf(block_theta);

@@ -4813,6 +4852,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
  }

+ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+ dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+ }
+
  static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -4822,6 +4866,15 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }

+ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+ switch (type) {
+ case GGML_TYPE_F32:
+ return convert_fp32_to_fp16_cuda;
+ default:
+ return nullptr;
+ }
+ }
+
  static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
  switch (type) {
  case GGML_TYPE_Q4_0:
@@ -5348,31 +5401,41 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
  }

- static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
- const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+ template<typename T>
+ static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+ const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
  GGML_ASSERT(ncols % 2 == 0);
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
- rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
+ if (pos == nullptr) {
+ rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ } else {
+ rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ }
  }

- static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
- const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+ template<typename T>
+ static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+ const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
  GGML_ASSERT(ncols % 2 == 0);
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
- rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
+ if (pos == nullptr) {
+ rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ } else {
+ rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ }
  }

- static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
- const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
+ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+ const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
  GGML_ASSERT(ncols % 4 == 0);
  const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
  const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
  const dim3 block_nums(num_blocks_x, nrows, 1);
- rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
  }

  static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -6003,8 +6066,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
  GGML_ASSERT(src1_ddf_i != nullptr);
  GGML_ASSERT(dst_dd_i != nullptr);

- const float alpha = 1.0f;
- const float beta = 0.0f;

  const int64_t ne00 = src0->ne[0];

@@ -6013,16 +6074,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
  const int64_t ne0 = dst->ne[0];
  const int64_t row_diff = row_high - row_low;

- float * src0_ddq_as_f32;
- size_t src0_as = 0;
-
- if (src0->type != GGML_TYPE_F32) {
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
- src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
- to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
- }
- const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
-
  int id;
  CUDA_CHECK(cudaGetDevice(&id));

@@ -6030,16 +6081,72 @@ inline void ggml_cuda_op_mul_mat_cublas(
  // ldc == nrows of the matrix that cuBLAS writes into
  int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;

- CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
- CUBLAS_CHECK(
- cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
- row_diff, src1_ncols, ne10,
- &alpha, src0_ddf_i, ne00,
- src1_ddf_i, ne10,
- &beta, dst_dd_i, ldc));
+ const int compute_capability = g_compute_capabilities[id];
+
+ if (compute_capability >= CC_TURING && src0->type == GGML_TYPE_F16 && ggml_is_contiguous(src0) && ldc == row_diff) {
+ // convert src1 to fp16, multiply as fp16, convert dst to fp32
+ half * src1_as_f16 = nullptr;
+ size_t src1_as = 0;
+ if (src1->type != GGML_TYPE_F16) {
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+ GGML_ASSERT(to_fp16_cuda != nullptr);
+ size_t ne = src1_ncols*ne10;
+ src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
+ to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
+ }
+ const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
+
+ size_t dst_as = 0;
+ half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
+
+ const half alpha_f16 = 1.0f;
+ const half beta_f16 = 0.0f;
+
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+ CUBLAS_CHECK(
+ cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+ row_diff, src1_ncols, ne10,
+ &alpha_f16, src0_dd_i, CUDA_R_16F, ne00,
+ src1_ptr, CUDA_R_16F, ne10,
+ &beta_f16, dst_f16, CUDA_R_16F, ldc,
+ CUBLAS_COMPUTE_16F,
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));

- if (src0_as > 0) {
- ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+ to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
+
+ ggml_cuda_pool_free(dst_f16, dst_as);
+
+ if (src1_as != 0) {
+ ggml_cuda_pool_free(src1_as_f16, src1_as);
+ }
+ }
+ else {
+ float * src0_ddq_as_f32 = nullptr;
+ size_t src0_as = 0;
+
+ if (src0->type != GGML_TYPE_F32) {
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+ GGML_ASSERT(to_fp32_cuda != nullptr);
+ src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
+ to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
+ }
+ const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
+
+ const float alpha = 1.0f;
+ const float beta = 0.0f;
+
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+ CUBLAS_CHECK(
+ cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+ row_diff, src1_ncols, ne10,
+ &alpha, src0_ddf_i, ne00,
+ src1_ddf_i, ne10,
+ &beta, dst_dd_i, ldc));
+
+ if (src0_as != 0) {
+ ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
+ }
  }

  (void) dst;
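The rewritten cuBLAS path above chooses between two branches: on devices whose compute capability is at least CC_TURING with a contiguous fp16 src0, it runs the product entirely in half precision through cublasGemmEx and converts the result back to fp32; everything else falls back to the previous cublasSgemm route. One detail when adapting this pattern: with a 16F compute type the alpha/beta scalars must themselves be half, which is why the branch declares half-typed constants. A minimal hedged sketch of that call shape for a plain column-major m x n = (m x k) * (k x n) product, with handle creation and error handling left to the caller; gemm_f16 is an illustrative name:

#include <cublas_v2.h>
#include <cuda_fp16.h>

// Sketch only: A, B, C are device buffers of half, column-major, no transposes.
static cublasStatus_t gemm_f16(cublasHandle_t handle, int m, int n, int k,
                               const half * A, const half * B, half * C) {
    const half alpha = 1.0f; // scalars match the 16F compute type
    const half beta  = 0.0f;
    return cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                        m, n, k,
                        &alpha, A, CUDA_R_16F, m,
                                B, CUDA_R_16F, k,
                        &beta,  C, CUDA_R_16F, m,
                        CUBLAS_COMPUTE_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}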
@@ -6051,14 +6158,16 @@ inline void ggml_cuda_op_rope(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

- GGML_ASSERT(src0->type == GGML_TYPE_F32);
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+ GGML_ASSERT(src0->type == dst->type);

  const int64_t ne00 = src0->ne[0];
  const int64_t ne01 = src0->ne[1];
+ const int64_t ne2 = dst->ne[2];
  const int64_t nrows = ggml_nrows(src0);

- const int n_past = ((int32_t *) dst->op_params)[0];
+ //const int n_past = ((int32_t *) dst->op_params)[0];
  const int n_dims = ((int32_t *) dst->op_params)[1];
  const int mode = ((int32_t *) dst->op_params)[2];
  const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -6069,19 +6178,38 @@ inline void ggml_cuda_op_rope(
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

  const float theta_scale = powf(freq_base, -2.0f/n_dims);
- const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+
+ const int32_t * pos = nullptr;
+ if ((mode & 1) == 0) {
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
+ GGML_ASSERT(src1->ne[0] == ne2);
+ pos = (const int32_t *) src1_dd;
+ }

  const bool is_neox = mode & 2;
  const bool is_glm = mode & 4;

  // compute
  if (is_glm) {
- rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream);
+ GGML_ASSERT(false);
+ rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
  } else if (is_neox) {
  GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
- rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
+ if (src0->type == GGML_TYPE_F32) {
+ rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ } else if (src0->type == GGML_TYPE_F16) {
+ rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ } else {
+ GGML_ASSERT(false);
+ }
  } else {
- rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
+ if (src0->type == GGML_TYPE_F32) {
+ rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ } else if (src0->type == GGML_TYPE_F16) {
+ rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ } else {
+ GGML_ASSERT(false);
+ }
  }

  (void) src1;
@@ -6252,6 +6380,43 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
  }
  }

+ static void ggml_cuda_set_peer_access(const int n_tokens) {
+ static bool peer_access_enabled = false;
+
+ const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
+
+ if (peer_access_enabled == enable_peer_access) {
+ return;
+ }
+
+ #ifdef NDEBUG
+ for (int id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(ggml_cuda_set_device(id));
+
+ for (int id_other = 0; id_other < g_device_count; ++id_other) {
+ if (id == id_other) {
+ continue;
+ }
+ if (id != g_main_device && id_other != g_main_device) {
+ continue;
+ }
+
+ int can_access_peer;
+ CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
+ if (can_access_peer) {
+ if (enable_peer_access) {
+ CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
+ } else {
+ CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
+ }
+ }
+ }
+ }
+ #endif // NDEBUG
+
+ peer_access_enabled = enable_peer_access;
+ }
+
  static void ggml_cuda_op_mul_mat(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
  const bool convert_src1_to_q8_1) {
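ggml_cuda_set_peer_access, added above and called at the top of ggml_cuda_op_mul_mat in the next hunk, toggles peer-to-peer access between the main device and every other device, and only for batches of at most GGML_CUDA_PEER_MAX_BATCH_SIZE tokens (and only in NDEBUG builds). One detail the pattern relies on is that enabling is directional: cudaDeviceEnablePeerAccess grants the currently selected device access to the peer, so the device has to be selected first. A condensed standalone sketch of that pattern, with an illustrative function name and plain CUDA runtime error handling:

#include <cuda_runtime.h>
#include <cstdio>

// Enable peer access in both directions between main_device and every other
// visible device, tolerating the "already enabled" case.
static void enable_peer_access_with_main(int main_device, int device_count) {
    for (int id = 0; id < device_count; ++id) {
        if (id == main_device) {
            continue;
        }
        const int pairs[2][2] = { { id, main_device }, { main_device, id } };
        for (const auto & p : pairs) {
            int can_access = 0;
            cudaDeviceCanAccessPeer(&can_access, p[0], p[1]);
            if (!can_access) {
                continue;
            }
            cudaSetDevice(p[0]); // enabling acts on the current device
            const cudaError_t err = cudaDeviceEnablePeerAccess(p[1], 0);
            if (err != cudaSuccess && err != cudaErrorPeerAccessAlreadyEnabled) {
                fprintf(stderr, "peer access %d -> %d failed: %s\n", p[0], p[1], cudaGetErrorString(err));
            }
        }
    }
}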
@@ -6276,6 +6441,8 @@ static void ggml_cuda_op_mul_mat(
  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];

+ ggml_cuda_set_peer_access(ne11);
+
  GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
  GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);

@@ -6408,7 +6575,7 @@ static void ggml_cuda_op_mul_mat(

  // wait for main GPU data if necessary
  if (split && (id != g_main_device || is != 0)) {
- CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
+ CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
  }

  for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
@@ -6530,7 +6697,7 @@ static void ggml_cuda_op_mul_mat(
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  for (int64_t id = 0; id < g_device_count; ++id) {
  for (int64_t is = 0; is < is_max; ++is) {
- CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
  }
  }
  }
@@ -6541,27 +6708,27 @@ static void ggml_cuda_op_mul_mat(
  }
  }

- void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
  }

- void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
  }

- void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
  }

- void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
  }

- void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
  }

- void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
  }

@@ -6572,17 +6739,13 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
  const int64_t ne1 = dst->ne[1];

  // TODO: find the optimal values for these
- if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
- src1->type == GGML_TYPE_F32 &&
- dst->type == GGML_TYPE_F32 &&
- (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
- return true;
- }
-
- return false;
+ return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+ src1->type == GGML_TYPE_F32 &&
+ dst->type == GGML_TYPE_F32 &&
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
  }

- void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
  GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@@ -6611,7 +6774,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
  }

- void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
  GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
  GGML_ASSERT(!ggml_is_permuted(src0));
  GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
@@ -6645,7 +6808,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
  }

- void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
  src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;

@@ -6689,11 +6852,11 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
  }
  }

- void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
  }

- void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  const int64_t ne = ggml_nelements(src0);
  GGML_ASSERT(ne == ggml_nelements(src1));

@@ -6735,35 +6898,37 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
  ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
  ne10, ne11, nb10, nb11, nb12, main_stream);
  } else {
+ fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+ ggml_type_name(src0->type), ggml_type_name(src1->type));
  GGML_ASSERT(false);
  }

  (void) dst;
  }

- void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_cpy(src0, dst, nullptr);
  (void) src1;
  }

- void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
  }

- void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
  }

- void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
  }

- void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
  }

- void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  (void) src0;
  (void) src1;
  (void) dst;
@@ -6886,11 +7051,13 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
  return extra;
  }

- void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
+ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
  if (scratch && g_scratch_size == 0) {
  return;
  }

+ tensor->backend = GGML_BACKEND_GPU;
+
  // recursively assign CUDA buffers until a compute tensor is found
  if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
  const ggml_op src0_op = tensor->src[0]->op;
@@ -6902,8 +7069,6 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
  }

- tensor->backend = GGML_BACKEND_GPU;
-
  if (scratch && no_alloc) {
  return;
  }
@@ -6964,6 +7129,7 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
  return;
  }
  if (g_scratch_buffer == nullptr) {
+ ggml_cuda_set_device(g_main_device);
  CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
  }

@@ -6987,6 +7153,15 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
  tensor->extra = extra;
  }

+ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+ GGML_ASSERT(ggml_is_contiguous(tensor));
+
+ struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
+ }
+
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
  ggml_cuda_assign_buffers_impl(tensor, true, false, false);
  }
@@ -7003,7 +7178,7 @@ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
  ggml_cuda_assign_buffers_impl(tensor, false, true, false);
  }

- void ggml_cuda_set_main_device(int main_device) {
+ void ggml_cuda_set_main_device(const int main_device) {
  if (main_device >= g_device_count) {
  fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
  main_device, g_device_count, g_main_device);
@@ -7017,12 +7192,17 @@ void ggml_cuda_set_main_device(int main_device) {
  }
  }

- void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
+ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
  g_mul_mat_q = mul_mat_q;
  }

- void ggml_cuda_set_scratch_size(size_t scratch_size) {
- g_scratch_size = scratch_size;
+ void ggml_cuda_set_scratch_size(const size_t scratch_size) {
+ // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
+ // it still won't always work as expected, but it's better than nothing
+ if (scratch_size > g_scratch_size) {
+ ggml_cuda_free_scratch();
+ }
+ g_scratch_size = std::max(g_scratch_size, scratch_size);
  }

  void ggml_cuda_free_scratch() {