llama_cpp 0.3.8 → 0.5.0

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
@@ -6,15 +6,116 @@
  #include <atomic>
  #include <assert.h>

+ #if defined(GGML_USE_HIPBLAS)
+ #include <hip/hip_runtime.h>
+ #include <hipblas/hipblas.h>
+ #include <hip/hip_fp16.h>
+ #ifdef __HIP_PLATFORM_AMD__
+ // for rocblas_initialize()
+ #include "rocblas/rocblas.h"
+ #endif
+ #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+ #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+ #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+ #define CUBLAS_OP_N HIPBLAS_OP_N
+ #define CUBLAS_OP_T HIPBLAS_OP_T
+ #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+ #define CUBLAS_TF32_TENSOR_OP_MATH 0
+ #define CUDA_R_16F HIPBLAS_R_16F
+ #define CUDA_R_32F HIPBLAS_R_32F
+ #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+ #define cublasCreate hipblasCreate
+ #define cublasGemmEx hipblasGemmEx
+ #define cublasHandle_t hipblasHandle_t
+ #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+ #define cublasSetStream hipblasSetStream
+ #define cublasSgemm hipblasSgemm
+ #define cublasStatus_t hipblasStatus_t
+ #define cudaDeviceProp hipDeviceProp_t
+ #define cudaDeviceSynchronize hipDeviceSynchronize
+ #define cudaError_t hipError_t
+ #define cudaEventCreateWithFlags hipEventCreateWithFlags
+ #define cudaEventDisableTiming hipEventDisableTiming
+ #define cudaEventRecord hipEventRecord
+ #define cudaEvent_t hipEvent_t
+ #define cudaEventDestroy hipEventDestroy
+ #define cudaFree hipFree
+ #define cudaFreeHost hipHostFree
+ #define cudaGetDevice hipGetDevice
+ #define cudaGetDeviceCount hipGetDeviceCount
+ #define cudaGetDeviceProperties hipGetDeviceProperties
+ #define cudaGetErrorString hipGetErrorString
+ #define cudaGetLastError hipGetLastError
+ #define cudaMalloc hipMalloc
+ #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+ #define cudaMemcpy hipMemcpy
+ #define cudaMemcpy2DAsync hipMemcpy2DAsync
+ #define cudaMemcpyAsync hipMemcpyAsync
+ #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+ #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+ #define cudaMemcpyKind hipMemcpyKind
+ #define cudaMemset hipMemset
+ #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+ #define cudaSetDevice hipSetDevice
+ #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+ #define cudaStreamNonBlocking hipStreamNonBlocking
+ #define cudaStreamSynchronize hipStreamSynchronize
+ #define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+ #define cudaStream_t hipStream_t
+ #define cudaSuccess hipSuccess
+ #else
  #include <cuda_runtime.h>
  #include <cublas_v2.h>
  #include <cuda_fp16.h>
+ #endif

  #include "ggml-cuda.h"
  #include "ggml.h"

  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+ #ifndef CC_TURING
  #define CC_TURING 700
+ #endif
+
+ #if defined(GGML_USE_HIPBLAS)
+ #define __CUDA_ARCH__ 1300
+
+ typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+ const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+ const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+ const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+ return reinterpret_cast<const int&>(c);
+ }
+
+ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+ #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+ c = __builtin_amdgcn_sdot4(a, b, c, false);
+ #elif defined(__gfx1100__)
+ c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+ #elif defined(__gfx1010__) || defined(__gfx900__)
+ int tmp1;
+ int tmp2;
+ asm("\n \
+ v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+ v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+ v_add3_u32 %0, %1, %2, %0 \n \
+ v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+ v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+ v_add3_u32 %0, %1, %2, %0 \n \
+ "
+ : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+ : "v"(a), "v"(b)
+ );
+ #else
+ const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+ const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+ c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+ #endif
+ return c;
+ }
+ #endif

  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
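The `__dp4a` shim added in the hunk above emulates CUDA's byte-wise dot-product intrinsic on AMD GPUs, using `__builtin_amdgcn_sdot4`/`__builtin_amdgcn_sudot4`, inline GCN assembly, or a scalar fallback depending on the target architecture. The following is a minimal host-side sketch, illustrative only and not code from the package, of the operation the intrinsic performs (four packed signed bytes per operand, multiplied pairwise and accumulated into c):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Reference for __dp4a(a, b, c): dot product of the four int8 lanes of a and b, added to c.
static int dp4a_reference(int a, int b, int c) {
    int8_t va[4], vb[4];
    std::memcpy(va, &a, 4);
    std::memcpy(vb, &b, 4);
    for (int k = 0; k < 4; ++k) {
        c += static_cast<int>(va[k]) * static_cast<int>(vb[k]);
    }
    return c;
}

int main() {
    // bytes of a: 0x04, 0x03, 0x02, 0x01; bytes of b: all 0x01
    // expected: 4*1 + 3*1 + 2*1 + 1*1 = 10
    std::printf("%d\n", dp4a_reference(0x01020304, 0x01010101, 0));
    return 0;
}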
@@ -205,11 +306,11 @@ typedef struct {
  #define QI4_K (QK_K / (4*QR4_K))
  #ifdef GGML_QKK_64
  typedef struct {
- half d[2]; // super-block scales/mins
+ half dm[2]; // super-block scales/mins
  uint8_t scales[2]; // 4-bit block scales/mins
  uint8_t qs[QK_K/2]; // 4--bit quants
  } block_q4_K;
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+ static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
  #else
  typedef struct {
  half2 dm; // super-block scale for quantized scales/mins
@@ -259,6 +360,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_CPY_BLOCK_SIZE 32
  #define CUDA_SCALE_BLOCK_SIZE 256
  #define CUDA_ROPE_BLOCK_SIZE 256
+ #define CUDA_ALIBI_BLOCK_SIZE 32
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
@@ -286,7 +388,7 @@ static int g_device_count = -1;
  static int g_main_device = 0;
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
- static bool g_mul_mat_q = false;
+ static bool g_mul_mat_q = true;

  static void * g_scratch_buffer = nullptr;
  static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
@@ -423,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
  static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
  const block_q4_1 * x = (const block_q4_1 *) vx;

- const dfloat d = x[ib].dm.x;
- const dfloat m = x[ib].dm.y;
+ const dfloat d = __low2half(x[ib].dm);
+ const dfloat m = __high2half(x[ib].dm);

  const int vui = x[ib].qs[iqs];

@@ -466,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
  static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
  const block_q5_1 * x = (const block_q5_1 *) vx;

- const dfloat d = x[ib].dm.x;
- const dfloat m = x[ib].dm.y;
+ const dfloat d = __low2half(x[ib].dm);
+ const dfloat m = __high2half(x[ib].dm);

  uint32_t qh;
  memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -519,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
  const uint8_t q = x[i].qs[32*n + l];
  float * y = yy + i*QK_K + 128*n;

- float dall = x[i].dm.x;
- float dmin = x[i].dm.y;
+ float dall = __low2half(x[i].dm);
+ float dmin = __high2half(x[i].dm);
  y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
  y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -530,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
  const int il = tid%16; // 0...15
  const uint8_t q = x[i].qs[il] >> (2*is);
  float * y = yy + i*QK_K + 16*is + il;
- float dall = x[i].dm.x;
- float dmin = x[i].dm.y;
+ float dall = __low2half(x[i].dm);
+ float dmin = __high2half(x[i].dm);
  y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
  y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
  #endif
@@ -617,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float

  float * y = yy + i*QK_K + 64*il + n*ir;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint8_t * q = x[i].qs + 32*il + n*ir;

@@ -635,8 +737,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
  const int tid = threadIdx.x;
  const uint8_t * q = x[i].qs;
  float * y = yy + i*QK_K;
- const float d = (float)x[i].d[0];
- const float m = (float)x[i].d[1];
+ const float d = (float)x[i].dm[0];
+ const float m = (float)x[i].dm[1];
  y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
  y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
  #endif
@@ -656,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float

  float * y = yy + i*QK_K + 64*il + 2*ir;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint8_t * ql = x[i].qs + 32*il + 2*ir;
  const uint8_t * qh = x[i].qh + 2*ir;
@@ -769,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
  const float * y = yy + i * QK_K + y_offset;
  const uint8_t * q = x[i].qs + q_offset;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
  aux[0] = a[0] & 0x0f0f0f0f;
@@ -990,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
  const float * y1 = yy + i*QK_K + y_offset;
  const float * y2 = y1 + 128;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint16_t * a = (const uint16_t *)x[i].scales;
  aux[0] = a[im+0] & kmask1;
@@ -1053,8 +1155,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
  const uint16_t * a = (const uint16_t *)x[i].scales;
  aux16[0] = a[0] & 0x0f0f;
  aux16[1] = (a[0] >> 4) & 0x0f0f;
- const float d = (float)x[i].d[0];
- const float m = (float)x[i].d[1];
+ const float d = (float)x[i].dm[0];
+ const float m = (float)x[i].dm[1];
  float sum = 0.f;
  for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
  sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
@@ -1123,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
  const float * y1 = yy + i*QK_K + y_offset;
  const float * y2 = y1 + 128;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint16_t * a = (const uint16_t *)x[i].scales;
  aux[0] = a[im+0] & kmask1;
@@ -1347,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
  return;
  }

- y[ib].ds.x = d;
- y[ib].ds.y = sum;
+ reinterpret_cast<half&>(y[ib].ds.x) = d;
+ reinterpret_cast<half&>(y[ib].ds.y) = sum;
  }

  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -2345,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
  u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
  }

- return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2431,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
  #pragma unroll
  for (int i = 0; i < QR2_K; ++ i) {
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
- d8[i] = bq8_1[bq8_offset + i].ds.x;
+ d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
  }

  return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -2550,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
  #pragma unroll
  for (int i = 0; i < QR3_K; ++i) {
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
- d8[i] = bq8_1[bq8_offset + i].ds.x;
+ d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
  }

  return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -2719,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(

  for (int i = 0; i < QR4_K; ++i) {
  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
- d8[i] = bq8i->ds.x;
+ d8[i] = __low2half(bq8i->ds);

  const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
  u[2*i+0] = q8[0];
@@ -2743,11 +2845,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  aux16[0] = a[0] & 0x0f0f;
  aux16[1] = (a[0] >> 4) & 0x0f0f;

- const float dall = bq4_K->d[0];
- const float dmin = bq4_K->d[1];
+ const float dall = bq4_K->dm[0];
+ const float dmin = bq4_K->dm[1];

- const float d8_1 = bq8_1[0].ds.x;
- const float d8_2 = bq8_1[1].ds.x;
+ const float d8_1 = __low2float(bq8_1[0].ds);
+ const float d8_2 = __low2float(bq8_1[1].ds);

  const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
  const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2827,7 +2929,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

  const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;

+ #if QK_K == 256
  x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+ #else
+ x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
+ #endif
  }

  #pragma unroll
@@ -2900,7 +3006,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  #pragma unroll
  for (int i = 0; i < QR5_K; ++i) {
  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
- d8[i] = bq8i->ds.x;
+ d8[i] = __low2float(bq8i->ds);

  const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
  u[2*i+0] = q8[0];
@@ -2918,8 +3024,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(

  const float d = bq5_K->d;

- const float d8_1 = bq8_1[0].ds.x;
- const float d8_2 = bq8_1[1].ds.x;
+ const float d8_1 = __low2half(bq8_1[0].ds);
+ const float d8_2 = __low2half(bq8_1[1].ds);

  const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
  const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -3017,7 +3123,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

  const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;

+ #if QK_K == 256
  x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+ #endif
  }

  #pragma unroll
@@ -3074,7 +3182,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
  #pragma unroll
  for (int i = 0; i < QR6_K; ++i) {
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
- d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
+ d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
  }

  return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -3242,7 +3350,7 @@ static __device__ __forceinline__ void mul_mat_q(
  *dsi_dst = *dsi_src;
  } else {
  float * dfi_dst = (float *) dsi_dst;
- *dfi_dst = (*dsi_src).x;
+ *dfi_dst = __low2half(*dsi_src);
  }
  }

@@ -3886,13 +3994,13 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
  // rope == RoPE == rotary positional embedding
  static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
  const float p_delta, const int p_delta_rows, const float theta_scale) {
- const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+ const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

  if (col >= ncols) {
  return;
  }

- const int row = blockDim.y*blockIdx.y + threadIdx.y;
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
  const int i = row*ncols + col;

  const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
@@ -3906,6 +4014,28 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
  dst[i + 1] = x0*sin_theta + x1*cos_theta;
  }

+ static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
+ const float p_delta, const int p_delta_rows, const float theta_scale) {
+ const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+ if (col >= ncols) {
+ return;
+ }
+
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
+ const int i = row*ncols + col/2;
+
+ const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+ const float sin_theta = sinf(theta);
+ const float cos_theta = cosf(theta);
+
+ const float x0 = x[i + 0];
+ const float x1 = x[i + ncols/2];
+
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
+ dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+ }
+
  static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
  const int half_n_dims = ncols/4;
@@ -3940,9 +4070,32 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
  dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
  }

- static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+ static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
+ const int n_heads_log2_floor, const float m0, const float m1) {
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (col >= ncols) {
+ return;
+ }
+
  const int row = blockDim.y*blockIdx.y + threadIdx.y;
+ const int i = row*ncols + col;
+
+ const int k = row/k_rows;
+
+ float m_k;
+ if (k < n_heads_log2_floor) {
+ m_k = powf(m0, k + 1);
+ } else {
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+ }
+
+ dst[i] = col * m_k + x[i];
+ }
+
+ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+ const int col = blockDim.y*blockIdx.y + threadIdx.y;
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;

  if (col >= ncols) {
  return;
@@ -3955,24 +4108,29 @@ static __global__ void diag_mask_inf_f32(const int

  // the CUDA soft max implementation differs from the CPU implementation
  // instead of doubles floats are used
- // values are also not normalized to the maximum value by subtracting it in the exponential function
- // theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
  static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
- const int row = blockDim.y*blockIdx.y + threadIdx.y;
- const int block_size = blockDim.x;
- const int tid = threadIdx.x;
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
+ const int block_size = blockDim.y;
+ const int tid = threadIdx.y;

- float tmp = 0.0;
+ float max_val = -INFINITY;

- for (int block_start = 0; block_start < ncols; block_start += block_size) {
- const int col = block_start + tid;
+ for (int col = tid; col < ncols; col += block_size) {
+ const int i = row*ncols + col;
+ max_val = max(max_val, x[i]);
+ }

- if (col >= ncols) {
- break;
- }
+ // find the max value in the block
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+ }
+
+ float tmp = 0.f;

+ for (int col = tid; col < ncols; col += block_size) {
  const int i = row*ncols + col;
- const float val = expf(x[i]);
+ const float val = expf(x[i] - max_val);
  tmp += val;
  dst[i] = val;
  }
@@ -3983,15 +4141,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  }

- for (int block_start = 0; block_start < ncols; block_start += block_size) {
- const int col = block_start + tid;
-
- if (col >= ncols) {
- break;
- }
+ const float inv_tmp = 1.f / tmp;

+ for (int col = tid; col < ncols; col += block_size) {
  const int i = row*ncols + col;
- dst[i] /= tmp;
+ dst[i] *= inv_tmp;
  }
  }

@@ -4561,6 +4715,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
  const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
  const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

+ #if QK_K == 256
+
  int id;
  CUDA_CHECK(cudaGetDevice(&id));
  const int compute_capability = g_compute_capabilities[id];
@@ -4592,6 +4748,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
  mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
  (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
  }
+ #endif
  }

  static void ggml_mul_mat_q4_K_q8_1_cuda(
@@ -4751,13 +4908,22 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons

  static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
  const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
- GGML_ASSERT(nrows % 2 == 0);
- const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+ GGML_ASSERT(ncols % 2 == 0);
+ const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
- const dim3 block_nums(num_blocks_x, nrows, 1);
+ const dim3 block_nums(nrows, num_blocks_x, 1);
  rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
  }

+ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+ const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+ GGML_ASSERT(ncols % 2 == 0);
+ const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+ const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+ const dim3 block_nums(nrows, num_blocks_x, 1);
+ rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
+ }
+
  static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
  GGML_ASSERT(nrows % 4 == 0);
  const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@@ -4766,16 +4932,25 @@ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, con
  rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
  }

+ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
+ const int k_rows, const int n_heads_log2_floor, const float m0,
+ const float m1, cudaStream_t stream) {
+ const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
+ const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
+ const dim3 block_nums(num_blocks_x, nrows, 1);
+ alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
+ }
+
  static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
- const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
+ const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
  const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
- const dim3 block_nums(block_num_x, nrows_x, 1);
+ const dim3 block_nums(nrows_x, block_num_x, 1);
  diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
  }

  static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
- const dim3 block_dims(WARP_SIZE, 1, 1);
- const dim3 block_nums(1, nrows_x, 1);
+ const dim3 block_dims(1, WARP_SIZE, 1);
+ const dim3 block_nums(nrows_x, 1, 1);
  soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
  }

@@ -4880,10 +5055,18 @@ void ggml_init_cublas() {
  static bool initialized = false;

  if (!initialized) {
+
+ #ifdef __HIP_PLATFORM_AMD__
+ // Workaround for a rocBLAS bug when using multiple graphics cards:
+ // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+ rocblas_initialize();
+ CUDA_CHECK(cudaDeviceSynchronize());
+ #endif
+
  CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
  int64_t total_vram = 0;
- fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+ fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
  for (int id = 0; id < g_device_count; ++id) {
  cudaDeviceProp prop;
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
@@ -5481,7 +5664,8 @@ inline void ggml_cuda_op_rope(

  const float theta_scale = powf(freq_base, -2.0f/n_dims);

- const bool is_glm = mode & 4;
+ const bool is_neox = mode & 2;
+ const bool is_glm = mode & 4;

  // compute
  if (is_glm) {
@@ -5489,6 +5673,10 @@
  const float id_p = min(p, n_ctx - 2.f);
  const float block_p = max(p - (n_ctx - 2.f), 0.f);
  rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+ } else if (is_neox) {
+ GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
+ const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+ rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
  } else {
  const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
  rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
@@ -5501,6 +5689,41 @@
  (void) i1;
  }

+ inline void ggml_cuda_op_alibi(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
+
+ GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t i01_diff = i01_high - i01_low;
+
+ const int n_past = ((int32_t *) dst->op_params)[0];
+ const int n_head = ((int32_t *) dst->op_params)[1];
+ float max_bias;
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+
+ GGML_ASSERT(ne01 + n_past == ne00);
+ GGML_ASSERT(n_head == ne02);
+
+ const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+
+ const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+
+ // compute
+ alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
+
+ (void) src1;
+ (void) src0_ddq_i;
+ (void) src1_ddf_i;
+ (void) i1;
+ }
+
  inline void ggml_cuda_op_diag_mask_inf(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
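The new ggml_cuda_op_alibi above derives the per-head ALiBi slopes from max_bias: heads below the largest power of two get slopes m0^(k+1), the remaining heads get interpolated slopes from m1. A small host-side sketch, illustrative only and not code from the package (the head count and max_bias values below are made-up examples), that prints the slopes this formula produces:

#include <cmath>
#include <cstdio>

int main() {
    // Example values, not taken from the diff: 8 heads, max_bias = 8.0
    const int n_head = 8;
    const float max_bias = 8.0f;

    const int n_heads_log2_floor = 1 << (int) std::floor(std::log2((float) n_head));
    const float m0 = std::pow(2.0f, -max_bias / n_heads_log2_floor);
    const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

    for (int k = 0; k < n_head; ++k) {
        // Same branch as the alibi_f32 kernel, with k indexing the head
        const float m_k = (k < n_heads_log2_floor)
            ? std::pow(m0, (float) (k + 1))
            : std::pow(m1, (float) (2 * (k - n_heads_log2_floor) + 1));
        std::printf("head %d: slope %g\n", k, m_k); // 0.5, 0.25, ..., 1/256 for these values
    }
    return 0;
}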
@@ -6115,12 +6338,19 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml

  void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented

  const int mode = ((int32_t *) dst->op_params)[2];
  const bool is_glm = mode & 4;
+
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
  }

+ void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+ }
+
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  (void) src0;
  (void) src1;
@@ -6240,7 +6470,7 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
  return extra;
  }

- void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
+ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
  if (scratch && g_scratch_size == 0) {
  return;
  }
@@ -6249,14 +6479,19 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
  const ggml_op src0_op = tensor->src[0]->op;
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
- ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
+ ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
  }
  }
  if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
- ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
+ ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
  }

  tensor->backend = GGML_BACKEND_GPU;
+
+ if (scratch && no_alloc) {
+ return;
+ }
+
  struct ggml_tensor_extra_gpu * extra;

  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
@@ -6308,16 +6543,48 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  tensor->extra = extra;
  }

+ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
+ if (g_scratch_size == 0) {
+ return;
+ }
+ if (g_scratch_buffer == nullptr) {
+ CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
+ }
+
+ struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+
+ const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
+ tensor->op == GGML_OP_VIEW;
+
+ if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+ char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+ size_t view_offset = 0;
+ if (tensor->op == GGML_OP_VIEW) {
+ memcpy(&view_offset, tensor->op_params, sizeof(size_t));
+ }
+ extra->data_device[g_main_device] = src0_ddc + view_offset;
+ } else {
+ extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
+ }
+
+ tensor->extra = extra;
+ }
+
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, true, false);
+ ggml_cuda_assign_buffers_impl(tensor, true, false, false);
+ }
+
+ void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
+ ggml_cuda_assign_buffers_impl(tensor, true, false, true);
  }

  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, false, false);
+ ggml_cuda_assign_buffers_impl(tensor, false, false, false);
  }

  void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, false, true);
+ ggml_cuda_assign_buffers_impl(tensor, false, true, false);
  }

  void ggml_cuda_set_main_device(int main_device) {
@@ -6456,6 +6723,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  }
  func = ggml_cuda_rope;
  break;
+ case GGML_OP_ALIBI:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_alibi;
+ break;
  default:
  return false;
  }