llama_cpp 0.3.8 → 0.5.0

@@ -6,15 +6,116 @@
  #include <atomic>
  #include <assert.h>

+ #if defined(GGML_USE_HIPBLAS)
+ #include <hip/hip_runtime.h>
+ #include <hipblas/hipblas.h>
+ #include <hip/hip_fp16.h>
+ #ifdef __HIP_PLATFORM_AMD__
+ // for rocblas_initialize()
+ #include "rocblas/rocblas.h"
+ #endif
+ #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+ #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+ #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+ #define CUBLAS_OP_N HIPBLAS_OP_N
+ #define CUBLAS_OP_T HIPBLAS_OP_T
+ #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+ #define CUBLAS_TF32_TENSOR_OP_MATH 0
+ #define CUDA_R_16F HIPBLAS_R_16F
+ #define CUDA_R_32F HIPBLAS_R_32F
+ #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+ #define cublasCreate hipblasCreate
+ #define cublasGemmEx hipblasGemmEx
+ #define cublasHandle_t hipblasHandle_t
+ #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+ #define cublasSetStream hipblasSetStream
+ #define cublasSgemm hipblasSgemm
+ #define cublasStatus_t hipblasStatus_t
+ #define cudaDeviceProp hipDeviceProp_t
+ #define cudaDeviceSynchronize hipDeviceSynchronize
+ #define cudaError_t hipError_t
+ #define cudaEventCreateWithFlags hipEventCreateWithFlags
+ #define cudaEventDisableTiming hipEventDisableTiming
+ #define cudaEventRecord hipEventRecord
+ #define cudaEvent_t hipEvent_t
+ #define cudaEventDestroy hipEventDestroy
+ #define cudaFree hipFree
+ #define cudaFreeHost hipHostFree
+ #define cudaGetDevice hipGetDevice
+ #define cudaGetDeviceCount hipGetDeviceCount
+ #define cudaGetDeviceProperties hipGetDeviceProperties
+ #define cudaGetErrorString hipGetErrorString
+ #define cudaGetLastError hipGetLastError
+ #define cudaMalloc hipMalloc
+ #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+ #define cudaMemcpy hipMemcpy
+ #define cudaMemcpy2DAsync hipMemcpy2DAsync
+ #define cudaMemcpyAsync hipMemcpyAsync
+ #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+ #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+ #define cudaMemcpyKind hipMemcpyKind
+ #define cudaMemset hipMemset
+ #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+ #define cudaSetDevice hipSetDevice
+ #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+ #define cudaStreamNonBlocking hipStreamNonBlocking
+ #define cudaStreamSynchronize hipStreamSynchronize
+ #define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+ #define cudaStream_t hipStream_t
+ #define cudaSuccess hipSuccess
+ #else
  #include <cuda_runtime.h>
  #include <cublas_v2.h>
  #include <cuda_fp16.h>
+ #endif

  #include "ggml-cuda.h"
  #include "ggml.h"

  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+ #ifndef CC_TURING
  #define CC_TURING 700
+ #endif
+
+ #if defined(GGML_USE_HIPBLAS)
+ #define __CUDA_ARCH__ 1300
+
+ typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+ const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+ const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+ const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+ return reinterpret_cast<const int&>(c);
+ }
+
+ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+ #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+ c = __builtin_amdgcn_sdot4(a, b, c, false);
+ #elif defined(__gfx1100__)
+ c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+ #elif defined(__gfx1010__) || defined(__gfx900__)
+ int tmp1;
+ int tmp2;
+ asm("\n \
+ v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+ v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+ v_add3_u32 %0, %1, %2, %0 \n \
+ v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+ v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+ v_add3_u32 %0, %1, %2, %0 \n \
+ "
+ : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+ : "v"(a), "v"(b)
+ );
+ #else
+ const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+ const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+ c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+ #endif
+ return c;
+ }
+ #endif

  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
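The generic fallback at the end of the new __dp4a shim defines the contract every branch above must satisfy: treat each 32-bit operand as four signed bytes, multiply the lanes pairwise, and accumulate the sum into c. A minimal host-side sketch of that contract, useful for sanity-checking the ROCm intrinsics against plain C++ (the helper name dp4a_ref is illustrative, not part of the diff):

#include <cstdint>

// Reference semantics of __dp4a(a, b, c): interpret a and b as four signed
// 8-bit lanes, form the byte-wise dot product, and add it to c.
static int dp4a_ref(int a, int b, int c) {
    const auto * va = reinterpret_cast<const int8_t *>(&a);
    const auto * vb = reinterpret_cast<const int8_t *>(&b);
    for (int i = 0; i < 4; ++i) {
        c += int(va[i]) * int(vb[i]);
    }
    return c;
}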
@@ -205,11 +306,11 @@ typedef struct {
  #define QI4_K (QK_K / (4*QR4_K))
  #ifdef GGML_QKK_64
  typedef struct {
- half d[2]; // super-block scales/mins
+ half dm[2]; // super-block scales/mins
  uint8_t scales[2]; // 4-bit block scales/mins
  uint8_t qs[QK_K/2]; // 4--bit quants
  } block_q4_K;
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+ static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
  #else
  typedef struct {
  half2 dm; // super-block scale for quantized scales/mins
@@ -259,6 +360,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_CPY_BLOCK_SIZE 32
  #define CUDA_SCALE_BLOCK_SIZE 256
  #define CUDA_ROPE_BLOCK_SIZE 256
+ #define CUDA_ALIBI_BLOCK_SIZE 32
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
@@ -286,7 +388,7 @@ static int g_device_count = -1;
  static int g_main_device = 0;
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
- static bool g_mul_mat_q = false;
+ static bool g_mul_mat_q = true;

  static void * g_scratch_buffer = nullptr;
  static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
@@ -423,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
  static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
  const block_q4_1 * x = (const block_q4_1 *) vx;

- const dfloat d = x[ib].dm.x;
- const dfloat m = x[ib].dm.y;
+ const dfloat d = __low2half(x[ib].dm);
+ const dfloat m = __high2half(x[ib].dm);

  const int vui = x[ib].qs[iqs];

@@ -466,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
  static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
  const block_q5_1 * x = (const block_q5_1 *) vx;

- const dfloat d = x[ib].dm.x;
- const dfloat m = x[ib].dm.y;
+ const dfloat d = __low2half(x[ib].dm);
+ const dfloat m = __high2half(x[ib].dm);

  uint32_t qh;
  memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -519,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
  const uint8_t q = x[i].qs[32*n + l];
  float * y = yy + i*QK_K + 128*n;

- float dall = x[i].dm.x;
- float dmin = x[i].dm.y;
+ float dall = __low2half(x[i].dm);
+ float dmin = __high2half(x[i].dm);
  y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
  y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -530,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
  const int il = tid%16; // 0...15
  const uint8_t q = x[i].qs[il] >> (2*is);
  float * y = yy + i*QK_K + 16*is + il;
- float dall = x[i].dm.x;
- float dmin = x[i].dm.y;
+ float dall = __low2half(x[i].dm);
+ float dmin = __high2half(x[i].dm);
  y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
  y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
  #endif
@@ -617,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float

  float * y = yy + i*QK_K + 64*il + n*ir;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint8_t * q = x[i].qs + 32*il + n*ir;

@@ -635,8 +737,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
  const int tid = threadIdx.x;
  const uint8_t * q = x[i].qs;
  float * y = yy + i*QK_K;
- const float d = (float)x[i].d[0];
- const float m = (float)x[i].d[1];
+ const float d = (float)x[i].dm[0];
+ const float m = (float)x[i].dm[1];
  y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
  y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
  #endif
@@ -656,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float

  float * y = yy + i*QK_K + 64*il + 2*ir;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint8_t * ql = x[i].qs + 32*il + 2*ir;
  const uint8_t * qh = x[i].qh + 2*ir;
@@ -769,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
  const float * y = yy + i * QK_K + y_offset;
  const uint8_t * q = x[i].qs + q_offset;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
  aux[0] = a[0] & 0x0f0f0f0f;
@@ -990,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
  const float * y1 = yy + i*QK_K + y_offset;
  const float * y2 = y1 + 128;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint16_t * a = (const uint16_t *)x[i].scales;
  aux[0] = a[im+0] & kmask1;
@@ -1053,8 +1155,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
  const uint16_t * a = (const uint16_t *)x[i].scales;
  aux16[0] = a[0] & 0x0f0f;
  aux16[1] = (a[0] >> 4) & 0x0f0f;
- const float d = (float)x[i].d[0];
- const float m = (float)x[i].d[1];
+ const float d = (float)x[i].dm[0];
+ const float m = (float)x[i].dm[1];
  float sum = 0.f;
  for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
  sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
@@ -1123,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
  const float * y1 = yy + i*QK_K + y_offset;
  const float * y2 = y1 + 128;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint16_t * a = (const uint16_t *)x[i].scales;
  aux[0] = a[im+0] & kmask1;
@@ -1347,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
  return;
  }

- y[ib].ds.x = d;
- y[ib].ds.y = sum;
+ reinterpret_cast<half&>(y[ib].ds.x) = d;
+ reinterpret_cast<half&>(y[ib].ds.y) = sum;
  }

  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -2345,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
  u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
  }

- return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2431,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
  #pragma unroll
  for (int i = 0; i < QR2_K; ++ i) {
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
- d8[i] = bq8_1[bq8_offset + i].ds.x;
+ d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
  }

  return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -2550,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
  #pragma unroll
  for (int i = 0; i < QR3_K; ++i) {
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
- d8[i] = bq8_1[bq8_offset + i].ds.x;
+ d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
  }

  return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -2719,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(

  for (int i = 0; i < QR4_K; ++i) {
  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
- d8[i] = bq8i->ds.x;
+ d8[i] = __low2half(bq8i->ds);

  const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
  u[2*i+0] = q8[0];
@@ -2743,11 +2845,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  aux16[0] = a[0] & 0x0f0f;
  aux16[1] = (a[0] >> 4) & 0x0f0f;

- const float dall = bq4_K->d[0];
- const float dmin = bq4_K->d[1];
+ const float dall = bq4_K->dm[0];
+ const float dmin = bq4_K->dm[1];

- const float d8_1 = bq8_1[0].ds.x;
- const float d8_2 = bq8_1[1].ds.x;
+ const float d8_1 = __low2float(bq8_1[0].ds);
+ const float d8_2 = __low2float(bq8_1[1].ds);

  const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
  const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2827,7 +2929,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

  const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;

+ #if QK_K == 256
  x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+ #else
+ x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
+ #endif
  }

  #pragma unroll
@@ -2900,7 +3006,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  #pragma unroll
  for (int i = 0; i < QR5_K; ++i) {
  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
- d8[i] = bq8i->ds.x;
+ d8[i] = __low2float(bq8i->ds);

  const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
  u[2*i+0] = q8[0];
@@ -2918,8 +3024,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(

  const float d = bq5_K->d;

- const float d8_1 = bq8_1[0].ds.x;
- const float d8_2 = bq8_1[1].ds.x;
+ const float d8_1 = __low2half(bq8_1[0].ds);
+ const float d8_2 = __low2half(bq8_1[1].ds);

  const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
  const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -3017,7 +3123,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

  const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;

+ #if QK_K == 256
  x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+ #endif
  }

  #pragma unroll
@@ -3074,7 +3182,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
  #pragma unroll
  for (int i = 0; i < QR6_K; ++i) {
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
- d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
+ d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
  }

  return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -3242,7 +3350,7 @@ static __device__ __forceinline__ void mul_mat_q(
  *dsi_dst = *dsi_src;
  } else {
  float * dfi_dst = (float *) dsi_dst;
- *dfi_dst = (*dsi_src).x;
+ *dfi_dst = __low2half(*dsi_src);
  }
  }

@@ -3886,13 +3994,13 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
  // rope == RoPE == rotary positional embedding
  static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
  const float p_delta, const int p_delta_rows, const float theta_scale) {
- const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+ const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

  if (col >= ncols) {
  return;
  }

- const int row = blockDim.y*blockIdx.y + threadIdx.y;
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
  const int i = row*ncols + col;

  const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
@@ -3906,6 +4014,28 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
  dst[i + 1] = x0*sin_theta + x1*cos_theta;
  }

+ static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
+ const float p_delta, const int p_delta_rows, const float theta_scale) {
+ const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+ if (col >= ncols) {
+ return;
+ }
+
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
+ const int i = row*ncols + col/2;
+
+ const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+ const float sin_theta = sinf(theta);
+ const float cos_theta = cosf(theta);
+
+ const float x0 = x[i + 0];
+ const float x1 = x[i + ncols/2];
+
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
+ dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+ }
+

  static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
@@ -3940,9 +4070,32 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
  dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
  }

- static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+ static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
+ const int n_heads_log2_floor, const float m0, const float m1) {
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (col >= ncols) {
+ return;
+ }
+
  const int row = blockDim.y*blockIdx.y + threadIdx.y;
+ const int i = row*ncols + col;
+
+ const int k = row/k_rows;
+
+ float m_k;
+ if (k < n_heads_log2_floor) {
+ m_k = powf(m0, k + 1);
+ } else {
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+ }
+
+ dst[i] = col * m_k + x[i];
+ }
+
+ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+ const int col = blockDim.y*blockIdx.y + threadIdx.y;
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;

  if (col >= ncols) {
  return;
@@ -3955,24 +4108,29 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int

  // the CUDA soft max implementation differs from the CPU implementation
  // instead of doubles floats are used
- // values are also not normalized to the maximum value by subtracting it in the exponential function
- // theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
  static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
- const int row = blockDim.y*blockIdx.y + threadIdx.y;
- const int block_size = blockDim.x;
- const int tid = threadIdx.x;
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
+ const int block_size = blockDim.y;
+ const int tid = threadIdx.y;

- float tmp = 0.0;
+ float max_val = -INFINITY;

- for (int block_start = 0; block_start < ncols; block_start += block_size) {
- const int col = block_start + tid;
+ for (int col = tid; col < ncols; col += block_size) {
+ const int i = row*ncols + col;
+ max_val = max(max_val, x[i]);
+ }

- if (col >= ncols) {
- break;
- }
+ // find the max value in the block
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+ }
+
+ float tmp = 0.f;

+ for (int col = tid; col < ncols; col += block_size) {
  const int i = row*ncols + col;
- const float val = expf(x[i]);
+ const float val = expf(x[i] - max_val);
  tmp += val;
  dst[i] = val;
  }
@@ -3983,15 +4141,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  }

- for (int block_start = 0; block_start < ncols; block_start += block_size) {
- const int col = block_start + tid;
-
- if (col >= ncols) {
- break;
- }
+ const float inv_tmp = 1.f / tmp;

+ for (int col = tid; col < ncols; col += block_size) {
  const int i = row*ncols + col;
- dst[i] /= tmp;
+ dst[i] *= inv_tmp;
  }
  }
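The reworked soft_max_f32 is the standard numerically stable softmax: subtracting the row maximum before exponentiation leaves the result unchanged, because the common factor exp(-max) cancels between numerator and denominator, but it keeps expf out of overflow range for large logits. A host-side sketch of the same three passes (max reduction, exponentiate-and-sum, scale by the reciprocal), with illustrative names only and assuming a non-empty row:

#include <algorithm>
#include <cmath>
#include <vector>

// Numerically stable softmax over one row, mirroring soft_max_f32:
// 1) find the row maximum, 2) exponentiate the shifted values and sum them,
// 3) multiply by the reciprocal of the sum.
static void softmax_row(std::vector<float> & row) {
    const float max_val = *std::max_element(row.begin(), row.end());
    float sum = 0.0f;
    for (float & v : row) {
        v = std::exp(v - max_val); // shifted exponent cannot overflow
        sum += v;
    }
    const float inv_sum = 1.0f / sum;
    for (float & v : row) {
        v *= inv_sum;
    }
}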

@@ -4561,6 +4715,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

+ #if QK_K == 256
+
  int id;
  CUDA_CHECK(cudaGetDevice(&id));
  const int compute_capability = g_compute_capabilities[id];
@@ -4592,6 +4748,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
  mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
+ #endif
  }

  static void ggml_mul_mat_q4_K_q8_1_cuda(
@@ -4751,13 +4908,22 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons

  static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
  const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
- GGML_ASSERT(nrows % 2 == 0);
- const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+ GGML_ASSERT(ncols % 2 == 0);
+ const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
- const dim3 block_nums(num_blocks_x, nrows, 1);
+ const dim3 block_nums(nrows, num_blocks_x, 1);
  rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
  }

+ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+ const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+ GGML_ASSERT(ncols % 2 == 0);
+ const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+ const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+ const dim3 block_nums(nrows, num_blocks_x, 1);
+ rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
+ }
+
  static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
  GGML_ASSERT(nrows % 4 == 0);
  const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@@ -4766,16 +4932,25 @@ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, con
  rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
  }

+ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
+ const int k_rows, const int n_heads_log2_floor, const float m0,
+ const float m1, cudaStream_t stream) {
+ const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
+ const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
+ const dim3 block_nums(num_blocks_x, nrows, 1);
+ alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
+ }
+
  static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
- const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
+ const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
  const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
- const dim3 block_nums(block_num_x, nrows_x, 1);
+ const dim3 block_nums(nrows_x, block_num_x, 1);
  diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
  }

  static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
- const dim3 block_dims(WARP_SIZE, 1, 1);
- const dim3 block_nums(1, nrows_x, 1);
+ const dim3 block_dims(1, WARP_SIZE, 1);
+ const dim3 block_nums(nrows_x, 1, 1);
  soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
  }

@@ -4880,10 +5055,18 @@ void ggml_init_cublas() {
  static bool initialized = false;

  if (!initialized) {
+
+ #ifdef __HIP_PLATFORM_AMD__
+ // Workaround for a rocBLAS bug when using multiple graphics cards:
+ // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+ rocblas_initialize();
+ CUDA_CHECK(cudaDeviceSynchronize());
+ #endif
+
  CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
  int64_t total_vram = 0;
- fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+ fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
  for (int id = 0; id < g_device_count; ++id) {
  cudaDeviceProp prop;
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
@@ -5481,7 +5664,8 @@ inline void ggml_cuda_op_rope(

  const float theta_scale = powf(freq_base, -2.0f/n_dims);

- const bool is_glm = mode & 4;
+ const bool is_neox = mode & 2;
+ const bool is_glm = mode & 4;

  // compute
  if (is_glm) {
@@ -5489,6 +5673,10 @@ inline void ggml_cuda_op_rope(
  const float id_p = min(p, n_ctx - 2.f);
  const float block_p = max(p - (n_ctx - 2.f), 0.f);
  rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+ } else if (is_neox) {
+ GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
+ const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+ rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
  } else {
  const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
  rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
@@ -5501,6 +5689,41 @@ inline void ggml_cuda_op_rope(
  (void) i1;
  }

+ inline void ggml_cuda_op_alibi(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
+
+ GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t i01_diff = i01_high - i01_low;
+
+ const int n_past = ((int32_t *) dst->op_params)[0];
+ const int n_head = ((int32_t *) dst->op_params)[1];
+ float max_bias;
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+
+ GGML_ASSERT(ne01 + n_past == ne00);
+ GGML_ASSERT(n_head == ne02);
+
+ const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+
+ const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+
+ // compute
+ alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
+
+ (void) src1;
+ (void) src0_ddq_i;
+ (void) src1_ddf_i;
+ (void) i1;
+ }
+
  inline void ggml_cuda_op_diag_mask_inf(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
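The new ggml_cuda_op_alibi follows the usual ALiBi slope schedule: the first 2^floor(log2(n_head)) heads get slopes m0^(k+1) with m0 = 2^(-max_bias / n_heads_log2_floor), and any remaining heads get the interleaved slopes m1^(2*(k - n_heads_log2_floor) + 1) with m1 = 2^(-max_bias/2 / n_heads_log2_floor); alibi_f32 then adds col * m_k to each element. A short host-side sketch that computes the per-head slope the same way (hypothetical helper name, illustration only):

#include <cmath>

// Per-head ALiBi slope, matching the m0/m1 schedule in ggml_cuda_op_alibi
// and the branch on k in alibi_f32.
static float alibi_slope(int k, int n_head, float max_bias) {
    const int n_heads_log2_floor = 1 << (int) std::floor(std::log2((float) n_head));
    const float m0 = std::pow(2.0f, -max_bias / n_heads_log2_floor);
    const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
    return k < n_heads_log2_floor
        ? std::pow(m0, (float) (k + 1))
        : std::pow(m1, (float) (2 * (k - n_heads_log2_floor) + 1));
}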
@@ -6115,12 +6338,19 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml

  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented

  const int mode = ((int32_t *) dst->op_params)[2];
  const bool is_glm = mode & 4;
+
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
  }

+ void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+ }
+
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  (void) src0;
  (void) src1;
@@ -6240,7 +6470,7 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
  return extra;
  }

- void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
+ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
  if (scratch && g_scratch_size == 0) {
  return;
  }
@@ -6249,14 +6479,19 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
  const ggml_op src0_op = tensor->src[0]->op;
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
- ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
+ ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
  }
  }
  if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
- ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
+ ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
  }

  tensor->backend = GGML_BACKEND_GPU;
+
+ if (scratch && no_alloc) {
+ return;
+ }
+
  struct ggml_tensor_extra_gpu * extra;

  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
@@ -6308,16 +6543,48 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  tensor->extra = extra;
  }

+ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
+ if (g_scratch_size == 0) {
+ return;
+ }
+ if (g_scratch_buffer == nullptr) {
+ CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
+ }
+
+ struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+
+ const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
+ tensor->op == GGML_OP_VIEW;
+
+ if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+ char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+ size_t view_offset = 0;
+ if (tensor->op == GGML_OP_VIEW) {
+ memcpy(&view_offset, tensor->op_params, sizeof(size_t));
+ }
+ extra->data_device[g_main_device] = src0_ddc + view_offset;
+ } else {
+ extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
+ }
+
+ tensor->extra = extra;
+ }
+
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, true, false);
+ ggml_cuda_assign_buffers_impl(tensor, true, false, false);
+ }
+
+ void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
+ ggml_cuda_assign_buffers_impl(tensor, true, false, true);
  }

  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, false, false);
+ ggml_cuda_assign_buffers_impl(tensor, false, false, false);
  }

  void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, false, true);
+ ggml_cuda_assign_buffers_impl(tensor, false, true, false);
  }

  void ggml_cuda_set_main_device(int main_device) {
@@ -6456,6 +6723,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  }
  func = ggml_cuda_rope;
  break;
+ case GGML_OP_ALIBI:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_alibi;
+ break;
  default:
  return false;
  }