llama_cpp 0.3.8 → 0.4.0

@@ -6,15 +6,116 @@
  #include <atomic>
  #include <assert.h>

+ #if defined(GGML_USE_HIPBLAS)
+ #include <hip/hip_runtime.h>
+ #include <hipblas/hipblas.h>
+ #include <hip/hip_fp16.h>
+ #ifdef __HIP_PLATFORM_AMD__
+ // for rocblas_initialize()
+ #include "rocblas/rocblas.h"
+ #endif
+ #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+ #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+ #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+ #define CUBLAS_OP_N HIPBLAS_OP_N
+ #define CUBLAS_OP_T HIPBLAS_OP_T
+ #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+ #define CUBLAS_TF32_TENSOR_OP_MATH 0
+ #define CUDA_R_16F HIPBLAS_R_16F
+ #define CUDA_R_32F HIPBLAS_R_32F
+ #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+ #define cublasCreate hipblasCreate
+ #define cublasGemmEx hipblasGemmEx
+ #define cublasHandle_t hipblasHandle_t
+ #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+ #define cublasSetStream hipblasSetStream
+ #define cublasSgemm hipblasSgemm
+ #define cublasStatus_t hipblasStatus_t
+ #define cudaDeviceProp hipDeviceProp_t
+ #define cudaDeviceSynchronize hipDeviceSynchronize
+ #define cudaError_t hipError_t
+ #define cudaEventCreateWithFlags hipEventCreateWithFlags
+ #define cudaEventDisableTiming hipEventDisableTiming
+ #define cudaEventRecord hipEventRecord
+ #define cudaEvent_t hipEvent_t
+ #define cudaEventDestroy hipEventDestroy
+ #define cudaFree hipFree
+ #define cudaFreeHost hipHostFree
+ #define cudaGetDevice hipGetDevice
+ #define cudaGetDeviceCount hipGetDeviceCount
+ #define cudaGetDeviceProperties hipGetDeviceProperties
+ #define cudaGetErrorString hipGetErrorString
+ #define cudaGetLastError hipGetLastError
+ #define cudaMalloc hipMalloc
+ #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+ #define cudaMemcpy hipMemcpy
+ #define cudaMemcpy2DAsync hipMemcpy2DAsync
+ #define cudaMemcpyAsync hipMemcpyAsync
+ #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+ #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+ #define cudaMemcpyKind hipMemcpyKind
+ #define cudaMemset hipMemset
+ #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+ #define cudaSetDevice hipSetDevice
+ #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+ #define cudaStreamNonBlocking hipStreamNonBlocking
+ #define cudaStreamSynchronize hipStreamSynchronize
+ #define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+ #define cudaStream_t hipStream_t
+ #define cudaSuccess hipSuccess
+ #else
  #include <cuda_runtime.h>
  #include <cublas_v2.h>
  #include <cuda_fp16.h>
+ #endif

  #include "ggml-cuda.h"
  #include "ggml.h"

  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+ #ifndef CC_TURING
  #define CC_TURING 700
+ #endif
+
+ #if defined(GGML_USE_HIPBLAS)
+ #define __CUDA_ARCH__ 1300
+
+ typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+     const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+     const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+     return reinterpret_cast<const int&>(c);
+ }
+
+ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+ #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+     c = __builtin_amdgcn_sdot4(a, b, c, false);
+ #elif defined(__gfx1100__)
+     c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+ #elif defined(__gfx1010__) || defined(__gfx900__)
+     int tmp1;
+     int tmp2;
+     asm("\n \
+         v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+         v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+         v_add3_u32 %0, %1, %2, %0 \n \
+         v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+         v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+         v_add3_u32 %0, %1, %2, %0 \n \
+         "
+         : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+         : "v"(a), "v"(b)
+     );
+ #else
+     const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+     c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+ #endif
+     return c;
+ }
+ #endif

  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
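Note (not part of the diff): the final #else branch of the emulated __dp4a above is a plain per-byte signed dot product. A minimal host-side sketch of that fallback, using a hypothetical helper name dp4a_ref and assuming four int8 lanes packed into an int:

    #include <cstdint>
    #include <cstring>

    // Reference byte-wise dot product: unpack the four signed 8-bit lanes of a
    // and b, multiply them pairwise, and accumulate them into c, mirroring the
    // scalar fallback branch of the emulated __dp4a above.
    static int dp4a_ref(int a, int b, int c) {
        int8_t va[4], vb[4];
        std::memcpy(va, &a, sizeof(va));
        std::memcpy(vb, &b, sizeof(vb));
        for (int i = 0; i < 4; ++i) {
            c += va[i] * vb[i];
        }
        return c;
    }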
@@ -259,6 +360,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_CPY_BLOCK_SIZE 32
  #define CUDA_SCALE_BLOCK_SIZE 256
  #define CUDA_ROPE_BLOCK_SIZE 256
+ #define CUDA_ALIBI_BLOCK_SIZE 32
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
@@ -286,7 +388,7 @@ static int g_device_count = -1;
  static int g_main_device = 0;
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
- static bool g_mul_mat_q = false;
+ static bool g_mul_mat_q = true;

  static void * g_scratch_buffer = nullptr;
  static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
@@ -423,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
  static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
      const block_q4_1 * x = (const block_q4_1 *) vx;

-     const dfloat d = x[ib].dm.x;
-     const dfloat m = x[ib].dm.y;
+     const dfloat d = __low2half(x[ib].dm);
+     const dfloat m = __high2half(x[ib].dm);

      const int vui = x[ib].qs[iqs];

@@ -466,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
  static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
      const block_q5_1 * x = (const block_q5_1 *) vx;

-     const dfloat d = x[ib].dm.x;
-     const dfloat m = x[ib].dm.y;
+     const dfloat d = __low2half(x[ib].dm);
+     const dfloat m = __high2half(x[ib].dm);

      uint32_t qh;
      memcpy(&qh, x[ib].qh, sizeof(qh));
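Note (not part of the diff): block_q4_1::dm and block_q5_1::dm pack the scale d and offset m into one half2, with d in the low 16-bit lane and m in the high lane; the diff replaces the .x/.y member reads with the __low2half/__high2half intrinsics, which read the same two lanes. A minimal device-side sketch with a hypothetical helper name unpack_dm:

    #include <cuda_fp16.h>

    // Reads the two packed 16-bit lanes of a half2: the low lane is the scale d,
    // the high lane the offset m, matching the dequantizers above.
    static __device__ __forceinline__ void unpack_dm(const half2 dm, float & d, float & m) {
        d = __low2float(dm);   // was dm.x
        m = __high2float(dm);  // was dm.y
    }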
@@ -519,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
      const uint8_t q = x[i].qs[32*n + l];
      float * y = yy + i*QK_K + 128*n;

-     float dall = x[i].dm.x;
-     float dmin = x[i].dm.y;
+     float dall = __low2half(x[i].dm);
+     float dmin = __high2half(x[i].dm);
      y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
      y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
      y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -530,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
      const int il = tid%16; // 0...15
      const uint8_t q = x[i].qs[il] >> (2*is);
      float * y = yy + i*QK_K + 16*is + il;
-     float dall = x[i].dm.x;
-     float dmin = x[i].dm.y;
+     float dall = __low2half(x[i].dm);
+     float dmin = __high2half(x[i].dm);
      y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
      y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
  #endif
@@ -617,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float

      float * y = yy + i*QK_K + 64*il + n*ir;

-     const float dall = x[i].dm.x;
-     const float dmin = x[i].dm.y;
+     const float dall = __low2half(x[i].dm);
+     const float dmin = __high2half(x[i].dm);

      const uint8_t * q = x[i].qs + 32*il + n*ir;

@@ -656,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float

      float * y = yy + i*QK_K + 64*il + 2*ir;

-     const float dall = x[i].dm.x;
-     const float dmin = x[i].dm.y;
+     const float dall = __low2half(x[i].dm);
+     const float dmin = __high2half(x[i].dm);

      const uint8_t * ql = x[i].qs + 32*il + 2*ir;
      const uint8_t * qh = x[i].qh + 2*ir;
@@ -769,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
          const float * y = yy + i * QK_K + y_offset;
          const uint8_t * q = x[i].qs + q_offset;

-         const float dall = x[i].dm.x;
-         const float dmin = x[i].dm.y;
+         const float dall = __low2half(x[i].dm);
+         const float dmin = __high2half(x[i].dm);

          const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
          aux[0] = a[0] & 0x0f0f0f0f;
@@ -990,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
          const float * y1 = yy + i*QK_K + y_offset;
          const float * y2 = y1 + 128;

-         const float dall = x[i].dm.x;
-         const float dmin = x[i].dm.y;
+         const float dall = __low2half(x[i].dm);
+         const float dmin = __high2half(x[i].dm);

          const uint16_t * a = (const uint16_t *)x[i].scales;
          aux[0] = a[im+0] & kmask1;
@@ -1123,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
          const float * y1 = yy + i*QK_K + y_offset;
          const float * y2 = y1 + 128;

-         const float dall = x[i].dm.x;
-         const float dmin = x[i].dm.y;
+         const float dall = __low2half(x[i].dm);
+         const float dmin = __high2half(x[i].dm);

          const uint16_t * a = (const uint16_t *)x[i].scales;
          aux[0] = a[im+0] & kmask1;
@@ -1347,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
          return;
      }

-     y[ib].ds.x = d;
-     y[ib].ds.y = sum;
+     reinterpret_cast<half&>(y[ib].ds.x) = d;
+     reinterpret_cast<half&>(y[ib].ds.y) = sum;
  }

  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -2345,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
          u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
      }

-     return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
+     return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2431,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
  #pragma unroll
      for (int i = 0; i < QR2_K; ++ i) {
          u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-         d8[i] = bq8_1[bq8_offset + i].ds.x;
+         d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
      }

      return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -2550,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
  #pragma unroll
      for (int i = 0; i < QR3_K; ++i) {
          u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-         d8[i] = bq8_1[bq8_offset + i].ds.x;
+         d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
      }

      return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -2719,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(

      for (int i = 0; i < QR4_K; ++i) {
          const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-         d8[i] = bq8i->ds.x;
+         d8[i] = __low2half(bq8i->ds);

          const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
          u[2*i+0] = q8[0];
@@ -2746,8 +2848,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
      const float dall = bq4_K->d[0];
      const float dmin = bq4_K->d[1];

-     const float d8_1 = bq8_1[0].ds.x;
-     const float d8_2 = bq8_1[1].ds.x;
+     const float d8_1 = __low2float(bq8_1[0].ds);
+     const float d8_2 = __low2float(bq8_1[1].ds);

      const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
      const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2900,7 +3002,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  #pragma unroll
      for (int i = 0; i < QR5_K; ++i) {
          const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-         d8[i] = bq8i->ds.x;
+         d8[i] = __low2float(bq8i->ds);

          const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
          u[2*i+0] = q8[0];
@@ -2918,8 +3020,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(

      const float d = bq5_K->d;

-     const float d8_1 = bq8_1[0].ds.x;
-     const float d8_2 = bq8_1[1].ds.x;
+     const float d8_1 = __low2half(bq8_1[0].ds);
+     const float d8_2 = __low2half(bq8_1[1].ds);

      const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
      const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -3074,7 +3176,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
  #pragma unroll
      for (int i = 0; i < QR6_K; ++i) {
          u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-         d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
+         d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
      }

      return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -3242,7 +3344,7 @@ static __device__ __forceinline__ void mul_mat_q(
          *dsi_dst = *dsi_src;
      } else {
          float * dfi_dst = (float *) dsi_dst;
-         *dfi_dst = (*dsi_src).x;
+         *dfi_dst = __low2half(*dsi_src);
      }
  }

@@ -3886,13 +3988,13 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
  // rope == RoPE == rotary positional embedding
  static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
                                  const float p_delta, const int p_delta_rows, const float theta_scale) {
-     const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

      if (col >= ncols) {
          return;
      }

-     const int row = blockDim.y*blockIdx.y + threadIdx.y;
+     const int row = blockDim.x*blockIdx.x + threadIdx.x;
      const int i = row*ncols + col;

      const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
@@ -3906,6 +4008,28 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
      dst[i + 1] = x0*sin_theta + x1*cos_theta;
  }

+ static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
+                                      const float p_delta, const int p_delta_rows, const float theta_scale) {
+     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+     if (col >= ncols) {
+         return;
+     }
+
+     const int row = blockDim.x*blockIdx.x + threadIdx.x;
+     const int i = row*ncols + col/2;
+
+     const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+     const float sin_theta = sinf(theta);
+     const float cos_theta = cosf(theta);
+
+     const float x0 = x[i + 0];
+     const float x1 = x[i + ncols/2];
+
+     dst[i + 0] = x0*cos_theta - x1*sin_theta;
+     dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+ }
+
  static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
      const int col = blockDim.x*blockIdx.x + threadIdx.x;
      const int half_n_dims = ncols/4;
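Note (not part of the diff): rope_neox_f32 above rotates element j of a row together with element j + ncols/2 instead of the adjacent pair used by rope_f32. A host-side sketch of the same rotation for one row, with the per-row position folded into a single parameter p (an assumption for brevity):

    #include <cmath>

    // NeoX-style RoPE over one row of length ncols (ncols even): element j is
    // paired with element j + ncols/2 and rotated by theta = p * theta_scale^j,
    // matching the indexing of rope_neox_f32 above (col/2 == j).
    static void rope_neox_row_ref(float * row, int ncols, float p, float theta_scale) {
        for (int j = 0; j < ncols/2; ++j) {
            const float theta     = p * std::pow(theta_scale, (float) j);
            const float sin_theta = std::sin(theta);
            const float cos_theta = std::cos(theta);
            const float x0 = row[j];
            const float x1 = row[j + ncols/2];
            row[j]           = x0*cos_theta - x1*sin_theta;
            row[j + ncols/2] = x0*sin_theta + x1*cos_theta;
        }
    }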
@@ -3940,9 +4064,32 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
      dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
  }

- static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+ static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
+                                  const int n_heads_log2_floor, const float m0, const float m1) {
      const int col = blockDim.x*blockIdx.x + threadIdx.x;
+
+     if (col >= ncols) {
+         return;
+     }
+
      const int row = blockDim.y*blockIdx.y + threadIdx.y;
+     const int i = row*ncols + col;
+
+     const int k = row/k_rows;
+
+     float m_k;
+     if (k < n_heads_log2_floor) {
+         m_k = powf(m0, k + 1);
+     } else {
+         m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+     }
+
+     dst[i] = col * m_k + x[i];
+ }
+
+ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+     const int col = blockDim.y*blockIdx.y + threadIdx.y;
+     const int row = blockDim.x*blockIdx.x + threadIdx.x;

      if (col >= ncols) {
          return;
@@ -3955,24 +4102,29 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int

  // the CUDA soft max implementation differs from the CPU implementation
  // instead of doubles floats are used
- // values are also not normalized to the maximum value by subtracting it in the exponential function
- // theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
  static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
-     const int row = blockDim.y*blockIdx.y + threadIdx.y;
-     const int block_size = blockDim.x;
-     const int tid = threadIdx.x;
+     const int row = blockDim.x*blockIdx.x + threadIdx.x;
+     const int block_size = blockDim.y;
+     const int tid = threadIdx.y;

-     float tmp = 0.0;
+     float max_val = -INFINITY;

-     for (int block_start = 0; block_start < ncols; block_start += block_size) {
-         const int col = block_start + tid;
+     for (int col = tid; col < ncols; col += block_size) {
+         const int i = row*ncols + col;
+         max_val = max(max_val, x[i]);
+     }

-         if (col >= ncols) {
-             break;
-         }
+     // find the max value in the block
+ #pragma unroll
+     for (int mask = 16; mask > 0; mask >>= 1) {
+         max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+     }
+
+     float tmp = 0.f;

+     for (int col = tid; col < ncols; col += block_size) {
          const int i = row*ncols + col;
-         const float val = expf(x[i]);
+         const float val = expf(x[i] - max_val);
          tmp += val;
          dst[i] = val;
      }
@@ -3983,15 +4135,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
          tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
      }

-     for (int block_start = 0; block_start < ncols; block_start += block_size) {
-         const int col = block_start + tid;
-
-         if (col >= ncols) {
-             break;
-         }
+     const float inv_tmp = 1.f / tmp;

+     for (int col = tid; col < ncols; col += block_size) {
          const int i = row*ncols + col;
-         dst[i] /= tmp;
+         dst[i] *= inv_tmp;
      }
  }
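Note (not part of the diff): the rewritten soft_max_f32 now subtracts the per-row maximum before calling expf (the removed comment acknowledged that the old kernel skipped this), which avoids overflow for large inputs, and it multiplies by the reciprocal of the sum instead of dividing per element. A host-side sketch of the same row-wise computation:

    #include <algorithm>
    #include <cmath>

    // Numerically stable softmax over one row of length ncols: subtract the row
    // maximum before exponentiating, then scale by 1/sum, mirroring the passes
    // of the rewritten soft_max_f32 kernel above.
    static void soft_max_row_ref(const float * x, float * dst, int ncols) {
        float max_val = -INFINITY;
        for (int col = 0; col < ncols; ++col) {
            max_val = std::max(max_val, x[col]);
        }
        float sum = 0.f;
        for (int col = 0; col < ncols; ++col) {
            dst[col] = std::exp(x[col] - max_val);
            sum += dst[col];
        }
        const float inv_sum = 1.f/sum;
        for (int col = 0; col < ncols; ++col) {
            dst[col] *= inv_sum;
        }
    }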
 
@@ -4751,13 +4899,21 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons

  static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                            const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-     GGML_ASSERT(nrows % 2 == 0);
-     const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+     GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
+     const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
      const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-     const dim3 block_nums(num_blocks_x, nrows, 1);
+     const dim3 block_nums(nrows, num_blocks_x, 1);
      rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
  }

+ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+                                const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+     const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
+     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+     const dim3 block_nums(nrows, num_blocks_x, 1);
+     rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
+ }
+
  static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
      GGML_ASSERT(nrows % 4 == 0);
      const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@@ -4766,16 +4922,25 @@ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, con
      rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
  }

+ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
+                            const int k_rows, const int n_heads_log2_floor, const float m0,
+                            const float m1, cudaStream_t stream) {
+     const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
+     const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
+     const dim3 block_nums(num_blocks_x, nrows, 1);
+     alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
+ }
+
  static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
-     const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
+     const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
      const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
-     const dim3 block_nums(block_num_x, nrows_x, 1);
+     const dim3 block_nums(nrows_x, block_num_x, 1);
      diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
  }

  static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
-     const dim3 block_dims(WARP_SIZE, 1, 1);
-     const dim3 block_nums(1, nrows_x, 1);
+     const dim3 block_dims(1, WARP_SIZE, 1);
+     const dim3 block_nums(nrows_x, 1, 1);
      soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
  }

@@ -4880,10 +5045,18 @@ void ggml_init_cublas() {
      static bool initialized = false;

      if (!initialized) {
+
+ #ifdef __HIP_PLATFORM_AMD__
+         // Workaround for a rocBLAS bug when using multiple graphics cards:
+         // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+         rocblas_initialize();
+         CUDA_CHECK(cudaDeviceSynchronize());
+ #endif
+
          CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
          GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
          int64_t total_vram = 0;
-         fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+         fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
          for (int id = 0; id < g_device_count; ++id) {
              cudaDeviceProp prop;
              CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
@@ -5481,7 +5654,8 @@ inline void ggml_cuda_op_rope(

      const float theta_scale = powf(freq_base, -2.0f/n_dims);

-     const bool is_glm = mode & 4;
+     const bool is_neox = mode & 2;
+     const bool is_glm = mode & 4;

      // compute
      if (is_glm) {
@@ -5489,6 +5663,10 @@
          const float id_p = min(p, n_ctx - 2.f);
          const float block_p = max(p - (n_ctx - 2.f), 0.f);
          rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+     } else if (is_neox) {
+         GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
+         const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+         rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
      } else {
          const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
          rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
@@ -5501,6 +5679,41 @@
      (void) i1;
  }

+ inline void ggml_cuda_op_alibi(
+     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+     cudaStream_t & cudaStream_main){
+
+     GGML_ASSERT(src0_ddf_i != nullptr);
+     GGML_ASSERT(dst_ddf_i != nullptr);
+
+     const int64_t ne00 = src0->ne[0];
+     const int64_t ne01 = src0->ne[1];
+     const int64_t ne02 = src0->ne[2];
+     const int64_t i01_diff = i01_high - i01_low;
+
+     const int n_past = ((int32_t *) dst->op_params)[0];
+     const int n_head = ((int32_t *) dst->op_params)[1];
+     float max_bias;
+     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+
+     GGML_ASSERT(ne01 + n_past == ne00);
+     GGML_ASSERT(n_head == ne02);
+
+     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+
+     const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+
+     // compute
+     alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
+
+     (void) src1;
+     (void) src0_ddq_i;
+     (void) src1_ddf_i;
+     (void) i1;
+ }
+
  inline void ggml_cuda_op_diag_mask_inf(
      const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
      float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
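Note (not part of the diff): ggml_cuda_op_alibi above derives the per-head ALiBi slope from m0 and m1: heads with index k below the largest power of two not exceeding n_head get m0^(k+1), the remaining heads get odd powers of m1. A host-side sketch of that slope computation, with a hypothetical helper name alibi_slope:

    #include <cmath>

    // ALiBi slope for head k, following the m0/m1 construction in
    // ggml_cuda_op_alibi/alibi_f32 above; n_heads_log2_floor is the largest
    // power of two that does not exceed n_head.
    static float alibi_slope(int k, int n_head, float max_bias) {
        const int n_heads_log2_floor = 1 << (int) std::floor(std::log2((float) n_head));
        const float m0 = std::pow(2.0f, -max_bias / n_heads_log2_floor);
        const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
        return k < n_heads_log2_floor ? std::pow(m0, (float) (k + 1))
                                      : std::pow(m1, (float) (2*(k - n_heads_log2_floor) + 1));
    }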
@@ -6121,6 +6334,11 @@ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ten
      ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
  }

+ void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+ }
+
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      (void) src0;
      (void) src1;
@@ -6240,7 +6458,7 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
      return extra;
  }

- void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
+ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
      if (scratch && g_scratch_size == 0) {
          return;
      }
@@ -6249,14 +6467,19 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
      if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
          const ggml_op src0_op = tensor->src[0]->op;
          if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
-             ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
+             ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
          }
      }
      if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
-         ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
+         ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
      }

      tensor->backend = GGML_BACKEND_GPU;
+
+     if (scratch && no_alloc) {
+         return;
+     }
+
      struct ggml_tensor_extra_gpu * extra;

      const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
@@ -6308,16 +6531,48 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
      tensor->extra = extra;
  }

+ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
+     if (g_scratch_size == 0) {
+         return;
+     }
+     if (g_scratch_buffer == nullptr) {
+         CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
+     }
+
+     struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+
+     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
+         tensor->op == GGML_OP_VIEW;
+
+     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+         struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+         size_t view_offset = 0;
+         if (tensor->op == GGML_OP_VIEW) {
+             memcpy(&view_offset, tensor->op_params, sizeof(size_t));
+         }
+         extra->data_device[g_main_device] = src0_ddc + view_offset;
+     } else {
+         extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
+     }
+
+     tensor->extra = extra;
+ }
+
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-     ggml_cuda_assign_buffers_impl(tensor, true, false);
+     ggml_cuda_assign_buffers_impl(tensor, true, false, false);
+ }
+
+ void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
+     ggml_cuda_assign_buffers_impl(tensor, true, false, true);
  }

  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-     ggml_cuda_assign_buffers_impl(tensor, false, false);
+     ggml_cuda_assign_buffers_impl(tensor, false, false, false);
  }

  void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
-     ggml_cuda_assign_buffers_impl(tensor, false, true);
+     ggml_cuda_assign_buffers_impl(tensor, false, true, false);
  }

  void ggml_cuda_set_main_device(int main_device) {
@@ -6456,6 +6711,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
              }
              func = ggml_cuda_rope;
              break;
+         case GGML_OP_ALIBI:
+             if (!any_on_device) {
+                 return false;
+             }
+             func = ggml_cuda_alibi;
+             break;
          default:
              return false;
      }