llama_cpp 0.5.2 → 0.6.0

This diff shows the changes between the two publicly released versions of the package as they appear in the public registry, and is provided for informational purposes only.
@@ -1,3 +1,4 @@
+ #include <algorithm>
  #include <cstddef>
  #include <cstdint>
  #include <limits>
@@ -14,9 +15,11 @@
  // for rocblas_initialize()
  #include "rocblas/rocblas.h"
  #endif // __HIP_PLATFORM_AMD__
+ #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
  #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
  #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
  #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+ #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
  #define CUBLAS_OP_N HIPBLAS_OP_N
  #define CUBLAS_OP_T HIPBLAS_OP_T
  #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
@@ -31,6 +34,9 @@
  #define cublasSetStream hipblasSetStream
  #define cublasSgemm hipblasSgemm
  #define cublasStatus_t hipblasStatus_t
+ #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+ #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+ #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
  #define cudaDeviceProp hipDeviceProp_t
  #define cudaDeviceSynchronize hipDeviceSynchronize
  #define cudaError_t hipError_t
@@ -61,7 +67,7 @@
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
  #define cudaStreamNonBlocking hipStreamNonBlocking
  #define cudaStreamSynchronize hipStreamSynchronize
- #define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+ #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
  #define cudaStream_t hipStream_t
  #define cudaSuccess hipSuccess
  #else
@@ -190,6 +196,12 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
  } while (0)
  #endif // CUDART_VERSION >= 11

+ #if CUDART_VERSION >= 11100
+ #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+ #else
+ #define GGML_CUDA_ASSUME(x)
+ #endif // CUDART_VERSION >= 11100
+
  #ifdef GGML_CUDA_F16
  typedef half dfloat; // dequantize float
  typedef half2 dfloat2;
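Note: the new GGML_CUDA_ASSUME macro is what the mul_mat_q tile-loading kernels later in this diff switch to instead of calling __builtin_assume directly; on CUDA 11.1 and newer it forwards the hint, on older toolkits it compiles to nothing. A minimal sketch of the idea, with a hypothetical kernel helper for illustration:

#include <cuda_runtime.h>

// Sketch only: GGML_CUDA_ASSUME passes a value-range hint to the compiler when
// __builtin_assume is available (CUDA >= 11.1) and is a no-op otherwise, so the
// same kernel source still builds on older toolkits.
#if CUDART_VERSION >= 11100
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
#endif

static __device__ void load_tile_example(int i_offset, int nwarps, int k, int warp_size) {
    // hypothetical helper mirroring how the kernels below use the macro
    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset < nwarps);
    GGML_CUDA_ASSUME(k >= 0);
    GGML_CUDA_ASSUME(k < warp_size);
    // ... tile loading elided ...
}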
@@ -226,8 +238,12 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t *
  return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  }

+ template<typename T>
+ using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
+ typedef to_t_cuda_t<float> to_fp32_cuda_t;
+ typedef to_t_cuda_t<half> to_fp16_cuda_t;
+
  typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
- typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
  typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
  typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
@@ -418,6 +434,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
  #endif

+ #ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
+ #define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
+ #endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
+
  #define MUL_MAT_SRC1_COL_STRIDE 128

  #define MAX_STREAMS 8
@@ -448,7 +468,7 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
  static bool g_mul_mat_q = true;

  static void * g_scratch_buffer = nullptr;
- static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
+ static size_t g_scratch_size = 0; // disabled by default
  static size_t g_scratch_offset = 0;

  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -1502,6 +1522,14 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
  v.y = x[ib + iqs + 1];
  }

+ static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
+ const float * x = (const float *) vx;
+
+ // automatic half -> float type cast if dfloat == float
+ v.x = x[ib + iqs + 0];
+ v.y = x[ib + iqs + 1];
+ }
+
  static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
  const int ix = blockDim.x*blockIdx.x + threadIdx.x;

@@ -1541,8 +1569,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
  reinterpret_cast<half&>(y[ib].ds.y) = sum;
  }

- template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
- static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
+ template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
  const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;

  if (i >= k) {
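Note: the dst_t template parameter on dequantize_block pairs with the new to_t_cuda_t<T> alias, so the same dequantization machinery can write either float or half output, and converters for both targets share one function-pointer type. A small self-contained sketch of the alias (names other than the typedefs are hypothetical):

#include <cuda_runtime.h>
#include <cuda_fp16.h>

// Sketch: to_t_cuda_t<T> describes "convert k elements of some source format
// into T on the given stream"; to_fp32_cuda_t and to_fp16_cuda_t are its two
// instantiations, as in the hunk above.
template<typename T>
using to_t_cuda_t = void (*)(const void * x, T * y, int k, cudaStream_t stream);
typedef to_t_cuda_t<float> to_fp32_cuda_t;
typedef to_t_cuda_t<half>  to_fp16_cuda_t;

// Hypothetical caller: pick a converter at runtime and run it if one exists.
static void convert_if_possible(to_fp16_cuda_t to_fp16, const void * src_dev,
                                half * dst_dev, int k, cudaStream_t stream) {
    if (to_fp16 != nullptr) {   // e.g. convert_fp32_to_fp16_cuda later in this diff
        to_fp16(src_dev, dst_dev, k, stream);
    }
}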
@@ -2145,10 +2173,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI4_0;
  const int kqsx = k % QI4_0;
@@ -2239,10 +2267,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI4_1;
  const int kqsx = k % QI4_1;
@@ -2331,10 +2359,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI5_0;
  const int kqsx = k % QI5_0;
@@ -2445,10 +2473,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI5_1;
  const int kqsx = k % QI5_1;
@@ -2551,10 +2579,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI8_0;
  const int kqsx = k % QI8_0;
@@ -2642,10 +2670,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI2_K;
  const int kqsx = k % QI2_K;
@@ -2763,10 +2791,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI3_K;
  const int kqsx = k % QI3_K;
@@ -2981,10 +3009,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI4_K; // == 0 if QK_K == 256
  const int kqsx = k % QI4_K; // == k if QK_K == 256
@@ -3162,10 +3190,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI5_K; // == 0 if QK_K == 256
  const int kqsx = k % QI5_K; // == k if QK_K == 256
@@ -3291,10 +3319,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
  const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
  int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

- __builtin_assume(i_offset >= 0);
- __builtin_assume(i_offset < nwarps);
- __builtin_assume(k >= 0);
- __builtin_assume(k < WARP_SIZE);
+ GGML_CUDA_ASSUME(i_offset >= 0);
+ GGML_CUDA_ASSUME(i_offset < nwarps);
+ GGML_CUDA_ASSUME(k >= 0);
+ GGML_CUDA_ASSUME(k < WARP_SIZE);

  const int kbx = k / QI6_K; // == 0 if QK_K == 256
  const int kqsx = k % QI6_K; // == k if QK_K == 256
@@ -4342,8 +4370,10 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
  }

  // rope == RoPE == rotary positional embedding
- static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
- const float p_delta, const int p_delta_rows, const float theta_scale) {
+
+ template<typename T, bool has_pos>
+ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
+ const int p_delta_rows, const float theta_scale) {
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

  if (col >= ncols) {
@@ -4352,8 +4382,11 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c

  const int row = blockDim.x*blockIdx.x + threadIdx.x;
  const int i = row*ncols + col;
+ const int i2 = row/p_delta_rows;

- const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+ const int p = has_pos ? pos[i2] : 0;
+ const float p0 = p*freq_scale;
+ const float theta = p0*powf(theta_scale, col/2);
  const float sin_theta = sinf(theta);
  const float cos_theta = cosf(theta);

@@ -4364,8 +4397,9 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
  dst[i + 1] = x0*sin_theta + x1*cos_theta;
  }

- static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
- const float p_delta, const int p_delta_rows, const float theta_scale) {
+ template<typename T, bool has_pos>
+ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
+ const int p_delta_rows, const float theta_scale) {
  const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

  if (col >= ncols) {
@@ -4374,8 +4408,11 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco

  const int row = blockDim.x*blockIdx.x + threadIdx.x;
  const int i = row*ncols + col/2;
+ const int i2 = row/p_delta_rows;

- const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+ const int p = has_pos ? pos[i2] : 0;
+ const float p0 = p*freq_scale;
+ const float theta = p0*powf(theta_scale, col/2);
  const float sin_theta = sinf(theta);
  const float cos_theta = cosf(theta);

@@ -4386,8 +4423,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
  dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
  }

- static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
- const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
+ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
+ const int p_delta_rows, const float theta_scale, const int n_ctx) {
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
  const int half_n_dims = ncols/4;

@@ -4397,11 +4434,13 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol

  const int row = blockDim.y*blockIdx.y + threadIdx.y;
  const int i = row*ncols + col;
+ const int i2 = row/p_delta_rows;

  const float col_theta_scale = powf(theta_scale, col);
- const float p = p0 + p_delta*(row/p_delta_rows);
+ // FIXME: this is likely wrong
+ const int p = pos != nullptr ? pos[i2] : 0;

- const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
+ const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
  const float sin_theta = sinf(theta);
  const float cos_theta = cosf(theta);

@@ -4411,7 +4450,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
  dst[i + 0] = x0*cos_theta - x1*sin_theta;
  dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

- const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
+ const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
  const float sin_block_theta = sinf(block_theta);
  const float cos_block_theta = cosf(block_theta);

@@ -4813,6 +4852,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
  }

+ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+ dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+ }
+
  static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
  const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -4822,6 +4866,15 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
  }

+ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+ switch (type) {
+ case GGML_TYPE_F32:
+ return convert_fp32_to_fp16_cuda;
+ default:
+ return nullptr;
+ }
+ }
+
  static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
  switch (type) {
  case GGML_TYPE_Q4_0:
@@ -5348,31 +5401,41 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
  scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
  }

- static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
- const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+ template<typename T>
+ static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+ const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
  GGML_ASSERT(ncols % 2 == 0);
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
- rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
+ if (pos == nullptr) {
+ rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ } else {
+ rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ }
  }

- static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
- const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+ template<typename T>
+ static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+ const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
  GGML_ASSERT(ncols % 2 == 0);
  const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
  const dim3 block_nums(nrows, num_blocks_x, 1);
- rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
+ if (pos == nullptr) {
+ rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ } else {
+ rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+ }
  }

- static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
- const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
+ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+ const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
  GGML_ASSERT(ncols % 4 == 0);
  const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
  const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
  const dim3 block_nums(num_blocks_x, nrows, 1);
- rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
+ rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
  }

  static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
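Note: the rope/rope_neox kernels and their launchers above are now templated on the element type and take a per-row array of token positions instead of a scalar starting position; with positions, the angle used at column c of row r is pos[r / p_delta_rows] * freq_scale * theta_scale^(c/2). A hedged sketch of the per-pair rotation and of a host-side launch; the device pointers, rows_per_channel and n_dims setup are illustrative assumptions, not code from this diff:

#include <cmath>
#include <cstdint>
#include <cuda_fp16.h>

// Sketch of the per-pair RoPE rotation the templated kernels perform.
static void rope_pair_example(float & x0, float & x1, int32_t pos_i,
                              float freq_scale, float theta_scale, int col) {
    const float theta = pos_i*freq_scale * powf(theta_scale, col/2);
    const float s = sinf(theta);
    const float c = cosf(theta);
    const float r0 = x0*c - x1*s;
    const float r1 = x0*s + x1*c;
    x0 = r0;
    x1 = r1;
}

// Hypothetical launch of the launcher defined above: pos_dev may be nullptr,
// in which case the <T, false> instantiation is chosen and every position is 0.
static void launch_rope_example(const half * x_dev, half * dst_dev, const int32_t * pos_dev,
                                int ncols, int nrows, int rows_per_channel,
                                int n_dims, float freq_base, float freq_scale,
                                cudaStream_t stream) {
    const float theta_scale = powf(freq_base, -2.0f/n_dims);
    rope_cuda<half>(x_dev, dst_dev, ncols, nrows, pos_dev, freq_scale,
                    rows_per_channel, theta_scale, stream);
}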
@@ -6003,8 +6066,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
  GGML_ASSERT(src1_ddf_i != nullptr);
  GGML_ASSERT(dst_dd_i != nullptr);

- const float alpha = 1.0f;
- const float beta = 0.0f;

  const int64_t ne00 = src0->ne[0];

@@ -6013,16 +6074,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
  const int64_t ne0 = dst->ne[0];
  const int64_t row_diff = row_high - row_low;

- float * src0_ddq_as_f32;
- size_t src0_as = 0;
-
- if (src0->type != GGML_TYPE_F32) {
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
- src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
- to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
- }
- const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
-
  int id;
  CUDA_CHECK(cudaGetDevice(&id));

@@ -6030,16 +6081,72 @@ inline void ggml_cuda_op_mul_mat_cublas(
  // ldc == nrows of the matrix that cuBLAS writes into
  int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;

- CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
- CUBLAS_CHECK(
- cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
- row_diff, src1_ncols, ne10,
- &alpha, src0_ddf_i, ne00,
- src1_ddf_i, ne10,
- &beta, dst_dd_i, ldc));
+ const int compute_capability = g_compute_capabilities[id];
+
+ if (compute_capability >= CC_TURING && src0->type == GGML_TYPE_F16 && ggml_is_contiguous(src0) && ldc == row_diff) {
+ // convert src1 to fp16, multiply as fp16, convert dst to fp32
+ half * src1_as_f16 = nullptr;
+ size_t src1_as = 0;
+ if (src1->type != GGML_TYPE_F16) {
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+ GGML_ASSERT(to_fp16_cuda != nullptr);
+ size_t ne = src1_ncols*ne10;
+ src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
+ to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
+ }
+ const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
+
+ size_t dst_as = 0;
+ half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
+
+ const half alpha_f16 = 1.0f;
+ const half beta_f16 = 0.0f;
+
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+ CUBLAS_CHECK(
+ cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+ row_diff, src1_ncols, ne10,
+ &alpha_f16, src0_dd_i, CUDA_R_16F, ne00,
+ src1_ptr, CUDA_R_16F, ne10,
+ &beta_f16, dst_f16, CUDA_R_16F, ldc,
+ CUBLAS_COMPUTE_16F,
+ CUBLAS_GEMM_DEFAULT_TENSOR_OP));

- if (src0_as > 0) {
- ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+ to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
+
+ ggml_cuda_pool_free(dst_f16, dst_as);
+
+ if (src1_as != 0) {
+ ggml_cuda_pool_free(src1_as_f16, src1_as);
+ }
+ }
+ else {
+ float * src0_ddq_as_f32 = nullptr;
+ size_t src0_as = 0;
+
+ if (src0->type != GGML_TYPE_F32) {
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+ GGML_ASSERT(to_fp32_cuda != nullptr);
+ src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
+ to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
+ }
+ const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
+
+ const float alpha = 1.0f;
+ const float beta = 0.0f;
+
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+ CUBLAS_CHECK(
+ cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+ row_diff, src1_ncols, ne10,
+ &alpha, src0_ddf_i, ne00,
+ src1_ddf_i, ne10,
+ &beta, dst_dd_i, ldc));
+
+ if (src0_as != 0) {
+ ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
+ }
  }

  (void) dst;
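Note: this is the main change in ggml_cuda_op_mul_mat_cublas. On devices at or above CC_TURING, an F16 src0 is no longer dequantized to F32 for cublasSgemm; instead src1 is converted to F16, the product is computed in half precision with cublasGemmEx, and only the result is converted back to F32. A stripped-down sketch of that call pattern on plain device buffers (handle creation, the fp32<->fp16 conversions, and the pool allocations are assumed to happen elsewhere):

#include <cublas_v2.h>
#include <cuda_fp16.h>

// Sketch: C = A^T * B in half precision with half accumulation, mirroring the
// cublasGemmEx call above. A is k x m, B is k x n (column-major), C is m x n.
static cublasStatus_t gemm_f16_sketch(cublasHandle_t handle, cudaStream_t stream,
                                      const half * A, const half * B, half * C,
                                      int m, int n, int k) {
    const half alpha = 1.0f;
    const half beta  = 0.0f;
    cublasSetStream(handle, stream);
    return cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                        m, n, k,
                        &alpha, A, CUDA_R_16F, k,   // lda = k: A is stored k x m and transposed
                                B, CUDA_R_16F, k,   // ldb = k
                        &beta,  C, CUDA_R_16F, m,   // ldc = m
                        CUBLAS_COMPUTE_16F,
                        CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}

The half-precision result is then converted back to F32 with the converter returned by ggml_get_to_fp32_cuda(GGML_TYPE_F16), as the hunk above shows.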
@@ -6051,14 +6158,16 @@ inline void ggml_cuda_op_rope(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
  const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

- GGML_ASSERT(src0->type == GGML_TYPE_F32);
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+ GGML_ASSERT(src0->type == dst->type);

  const int64_t ne00 = src0->ne[0];
  const int64_t ne01 = src0->ne[1];
+ const int64_t ne2 = dst->ne[2];
  const int64_t nrows = ggml_nrows(src0);

- const int n_past = ((int32_t *) dst->op_params)[0];
+ //const int n_past = ((int32_t *) dst->op_params)[0];
  const int n_dims = ((int32_t *) dst->op_params)[1];
  const int mode = ((int32_t *) dst->op_params)[2];
  const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -6069,19 +6178,38 @@ inline void ggml_cuda_op_rope(
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

  const float theta_scale = powf(freq_base, -2.0f/n_dims);
- const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+
+ const int32_t * pos = nullptr;
+ if ((mode & 1) == 0) {
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
+ GGML_ASSERT(src1->ne[0] == ne2);
+ pos = (const int32_t *) src1_dd;
+ }

  const bool is_neox = mode & 2;
  const bool is_glm = mode & 4;

  // compute
  if (is_glm) {
- rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream);
+ GGML_ASSERT(false);
+ rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
  } else if (is_neox) {
  GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
- rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
+ if (src0->type == GGML_TYPE_F32) {
+ rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ } else if (src0->type == GGML_TYPE_F16) {
+ rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ } else {
+ GGML_ASSERT(false);
+ }
  } else {
- rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream);
+ if (src0->type == GGML_TYPE_F32) {
+ rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ } else if (src0->type == GGML_TYPE_F16) {
+ rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+ } else {
+ GGML_ASSERT(false);
+ }
  }

  (void) src1;
@@ -6252,6 +6380,43 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
  }
  }

+ static void ggml_cuda_set_peer_access(const int n_tokens) {
+ static bool peer_access_enabled = false;
+
+ const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
+
+ if (peer_access_enabled == enable_peer_access) {
+ return;
+ }
+
+ #ifdef NDEBUG
+ for (int id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(ggml_cuda_set_device(id));
+
+ for (int id_other = 0; id_other < g_device_count; ++id_other) {
+ if (id == id_other) {
+ continue;
+ }
+ if (id != g_main_device && id_other != g_main_device) {
+ continue;
+ }
+
+ int can_access_peer;
+ CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
+ if (can_access_peer) {
+ if (enable_peer_access) {
+ CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
+ } else {
+ CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
+ }
+ }
+ }
+ }
+ #endif // NDEBUG
+
+ peer_access_enabled = enable_peer_access;
+ }
+
  static void ggml_cuda_op_mul_mat(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
  const bool convert_src1_to_q8_1) {
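Note: ggml_cuda_set_peer_access, called at the top of ggml_cuda_op_mul_mat in the next hunk, enables peer access between the main device and the other devices only while the token batch is at most GGML_CUDA_PEER_MAX_BATCH_SIZE (128 unless overridden at build time), and the actual enable/disable calls are compiled only when NDEBUG is defined. A hedged sketch of the underlying CUDA runtime calls for a single device pair:

#include <cuda_runtime.h>

// Sketch: toggle one-directional peer access from `device` to `peer`,
// mirroring the runtime calls the diff wraps in CUDA_CHECK.
static cudaError_t set_peer_access_sketch(int device, int peer, bool enable) {
    int can_access = 0;
    cudaError_t err = cudaDeviceCanAccessPeer(&can_access, device, peer);
    if (err != cudaSuccess || !can_access) {
        return err;
    }
    cudaSetDevice(device);   // peer access is configured from the accessing device
    return enable ? cudaDeviceEnablePeerAccess(peer, 0)
                  : cudaDeviceDisablePeerAccess(peer);
}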
@@ -6276,6 +6441,8 @@ static void ggml_cuda_op_mul_mat(
  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];

+ ggml_cuda_set_peer_access(ne11);
+
  GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
  GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);

@@ -6408,7 +6575,7 @@ static void ggml_cuda_op_mul_mat(

  // wait for main GPU data if necessary
  if (split && (id != g_main_device || is != 0)) {
- CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
+ CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
  }

  for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
@@ -6530,7 +6697,7 @@ static void ggml_cuda_op_mul_mat(
  CUDA_CHECK(ggml_cuda_set_device(g_main_device));
  for (int64_t id = 0; id < g_device_count; ++id) {
  for (int64_t is = 0; is < is_max; ++is) {
- CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
+ CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
  }
  }
  }
@@ -6541,27 +6708,27 @@
  }
  }

- void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
  }

- void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
  }

- void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
  }

- void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
  }

- void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
  }

- void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
  }

@@ -6572,17 +6739,13 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
  const int64_t ne1 = dst->ne[1];

  // TODO: find the optimal values for these
- if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
- src1->type == GGML_TYPE_F32 &&
- dst->type == GGML_TYPE_F32 &&
- (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
- return true;
- }
-
- return false;
+ return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+ src1->type == GGML_TYPE_F32 &&
+ dst->type == GGML_TYPE_F32 &&
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
  }

- void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
  GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@@ -6611,7 +6774,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
  }

- void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
  GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
  GGML_ASSERT(!ggml_is_permuted(src0));
  GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
@@ -6645,7 +6808,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
  }

- void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
  src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;

@@ -6689,11 +6852,11 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
  }
  }

- void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
  }

- void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  const int64_t ne = ggml_nelements(src0);
  GGML_ASSERT(ne == ggml_nelements(src1));

@@ -6735,35 +6898,37 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
  ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
  ne10, ne11, nb10, nb11, nb12, main_stream);
  } else {
+ fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+ ggml_type_name(src0->type), ggml_type_name(src1->type));
  GGML_ASSERT(false);
  }

  (void) dst;
  }

- void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_cpy(src0, dst, nullptr);
  (void) src1;
  }

- void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
  }

- void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
  }

- void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
  }

- void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
  }

- void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  (void) src0;
  (void) src1;
  (void) dst;
@@ -6886,11 +7051,13 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
  return extra;
  }

- void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
+ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
  if (scratch && g_scratch_size == 0) {
  return;
  }

+ tensor->backend = GGML_BACKEND_GPU;
+
  // recursively assign CUDA buffers until a compute tensor is found
  if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
  const ggml_op src0_op = tensor->src[0]->op;
@@ -6902,8 +7069,6 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
  }

- tensor->backend = GGML_BACKEND_GPU;
-
  if (scratch && no_alloc) {
  return;
  }
@@ -6964,6 +7129,7 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
  return;
  }
  if (g_scratch_buffer == nullptr) {
+ ggml_cuda_set_device(g_main_device);
  CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
  }

@@ -6987,6 +7153,15 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
  tensor->extra = extra;
  }

+ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+ GGML_ASSERT(ggml_is_contiguous(tensor));
+
+ struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+ CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+ CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
+ }
+
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
  ggml_cuda_assign_buffers_impl(tensor, true, false, false);
  }
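Note: ggml_cuda_copy_to_device is a new public entry point that pushes the host-side data of a contiguous, GPU-assigned tensor to the main device. A hedged usage sketch; the context setup and the use of ggml_cuda_assign_buffers_no_scratch are assumptions about the typical calling pattern, not code from this diff:

#include "ggml.h"
#include "ggml-cuda.h"

// Sketch: create a tensor, give it a GPU buffer outside the scratch pool,
// fill it on the host, then copy it to the main device.
static void upload_example(struct ggml_context * ctx) {
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    ggml_cuda_assign_buffers_no_scratch(t);      // backend becomes GGML_BACKEND_GPU
    for (int i = 0; i < 1024; ++i) {
        ((float *) t->data)[i] = (float) i;      // host-side fill
    }
    ggml_cuda_copy_to_device(t);                 // copies ggml_nbytes(t) bytes host -> device
}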
@@ -7003,7 +7178,7 @@ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
  ggml_cuda_assign_buffers_impl(tensor, false, true, false);
  }

- void ggml_cuda_set_main_device(int main_device) {
+ void ggml_cuda_set_main_device(const int main_device) {
  if (main_device >= g_device_count) {
  fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
  main_device, g_device_count, g_main_device);
@@ -7017,12 +7192,17 @@ void ggml_cuda_set_main_device(int main_device) {
  }
  }

- void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
+ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
  g_mul_mat_q = mul_mat_q;
  }

- void ggml_cuda_set_scratch_size(size_t scratch_size) {
- g_scratch_size = scratch_size;
+ void ggml_cuda_set_scratch_size(const size_t scratch_size) {
+ // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
+ // it still won't always work as expected, but it's better than nothing
+ if (scratch_size > g_scratch_size) {
+ ggml_cuda_free_scratch();
+ }
+ g_scratch_size = std::max(g_scratch_size, scratch_size);
  }

  void ggml_cuda_free_scratch() {
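Note: together with the new default of g_scratch_size = 0, ggml_cuda_set_scratch_size now only ever grows the scratch size, freeing any existing buffer first so it is lazily re-allocated at the larger size; this is also why <algorithm> is now included at the top of the file, for std::max. A short hedged sketch of the resulting behaviour from the caller's side (the sizes are illustrative):

#include "ggml-cuda.h"

// Sketch: repeated calls keep the largest requested size; smaller requests are no-ops.
static void scratch_size_example(void) {
    ggml_cuda_set_scratch_size(256u*1024*1024);  // grows 0 -> 256 MiB (old buffer freed, if any)
    ggml_cuda_set_scratch_size(128u*1024*1024);  // kept at 256 MiB via std::max
    ggml_cuda_set_scratch_size(512u*1024*1024);  // grows to 512 MiB
}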