llama_cpp 0.3.8 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,15 +6,116 @@
  #include <atomic>
  #include <assert.h>

+ #if defined(GGML_USE_HIPBLAS)
+ #include <hip/hip_runtime.h>
+ #include <hipblas/hipblas.h>
+ #include <hip/hip_fp16.h>
+ #ifdef __HIP_PLATFORM_AMD__
+ // for rocblas_initialize()
+ #include "rocblas/rocblas.h"
+ #endif
+ #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+ #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+ #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+ #define CUBLAS_OP_N HIPBLAS_OP_N
+ #define CUBLAS_OP_T HIPBLAS_OP_T
+ #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+ #define CUBLAS_TF32_TENSOR_OP_MATH 0
+ #define CUDA_R_16F HIPBLAS_R_16F
+ #define CUDA_R_32F HIPBLAS_R_32F
+ #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+ #define cublasCreate hipblasCreate
+ #define cublasGemmEx hipblasGemmEx
+ #define cublasHandle_t hipblasHandle_t
+ #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+ #define cublasSetStream hipblasSetStream
+ #define cublasSgemm hipblasSgemm
+ #define cublasStatus_t hipblasStatus_t
+ #define cudaDeviceProp hipDeviceProp_t
+ #define cudaDeviceSynchronize hipDeviceSynchronize
+ #define cudaError_t hipError_t
+ #define cudaEventCreateWithFlags hipEventCreateWithFlags
+ #define cudaEventDisableTiming hipEventDisableTiming
+ #define cudaEventRecord hipEventRecord
+ #define cudaEvent_t hipEvent_t
+ #define cudaEventDestroy hipEventDestroy
+ #define cudaFree hipFree
+ #define cudaFreeHost hipHostFree
+ #define cudaGetDevice hipGetDevice
+ #define cudaGetDeviceCount hipGetDeviceCount
+ #define cudaGetDeviceProperties hipGetDeviceProperties
+ #define cudaGetErrorString hipGetErrorString
+ #define cudaGetLastError hipGetLastError
+ #define cudaMalloc hipMalloc
+ #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+ #define cudaMemcpy hipMemcpy
+ #define cudaMemcpy2DAsync hipMemcpy2DAsync
+ #define cudaMemcpyAsync hipMemcpyAsync
+ #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+ #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+ #define cudaMemcpyKind hipMemcpyKind
+ #define cudaMemset hipMemset
+ #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+ #define cudaSetDevice hipSetDevice
+ #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+ #define cudaStreamNonBlocking hipStreamNonBlocking
+ #define cudaStreamSynchronize hipStreamSynchronize
+ #define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+ #define cudaStream_t hipStream_t
+ #define cudaSuccess hipSuccess
+ #else
  #include <cuda_runtime.h>
  #include <cublas_v2.h>
  #include <cuda_fp16.h>
+ #endif

  #include "ggml-cuda.h"
  #include "ggml.h"

  #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+ #ifndef CC_TURING
  #define CC_TURING 700
+ #endif
+
+ #if defined(GGML_USE_HIPBLAS)
+ #define __CUDA_ARCH__ 1300
+
+ typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+ const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+ const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+ const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+ return reinterpret_cast<const int&>(c);
+ }
+
+ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+ #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+ c = __builtin_amdgcn_sdot4(a, b, c, false);
+ #elif defined(__gfx1100__)
+ c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+ #elif defined(__gfx1010__) || defined(__gfx900__)
+ int tmp1;
+ int tmp2;
+ asm("\n \
+ v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+ v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+ v_add3_u32 %0, %1, %2, %0 \n \
+ v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+ v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+ v_add3_u32 %0, %1, %2, %0 \n \
+ "
+ : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+ : "v"(a), "v"(b)
+ );
+ #else
+ const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+ const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+ c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+ #endif
+ return c;
+ }
+ #endif

  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
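
For context: the new GGML_USE_HIPBLAS block lets this CUDA source build on ROCm by aliasing cuBLAS/CUDA runtime names to their hipBLAS/HIP counterparts and by emulating the few device intrinsics HIP does not provide. As a rough scalar sketch of what the emulated __dp4a computes (an illustration, not part of the diff): each 32-bit operand is treated as four packed signed bytes whose products are accumulated into c.

    #include <stdint.h>

    // scalar reference for __dp4a(a, b, c): per-byte signed products accumulated into c
    static int dp4a_reference(int a, int b, int c) {
        const int8_t * va = (const int8_t *) &a;
        const int8_t * vb = (const int8_t *) &b;
        for (int k = 0; k < 4; ++k) {
            c += (int) va[k] * (int) vb[k];
        }
        return c;
    }
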
@@ -259,6 +360,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
  #define CUDA_CPY_BLOCK_SIZE 32
  #define CUDA_SCALE_BLOCK_SIZE 256
  #define CUDA_ROPE_BLOCK_SIZE 256
+ #define CUDA_ALIBI_BLOCK_SIZE 32
  #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
  #define CUDA_QUANTIZE_BLOCK_SIZE 256
  #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
@@ -286,7 +388,7 @@ static int g_device_count = -1;
  static int g_main_device = 0;
  static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
- static bool g_mul_mat_q = false;
+ static bool g_mul_mat_q = true;

  static void * g_scratch_buffer = nullptr;
  static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
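
Note that g_mul_mat_q now defaults to true, so quantized matrix multiplications go through the custom mul_mat_q kernels unless the caller opts out. Assuming the setter exposed in ggml-cuda.h is unchanged, the old behaviour can still be requested (hedged sketch):

    // restores the previous default (cuBLAS-based path) if desired
    ggml_cuda_set_mul_mat_q(false);
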
@@ -423,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
  static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
  const block_q4_1 * x = (const block_q4_1 *) vx;

- const dfloat d = x[ib].dm.x;
- const dfloat m = x[ib].dm.y;
+ const dfloat d = __low2half(x[ib].dm);
+ const dfloat m = __high2half(x[ib].dm);

  const int vui = x[ib].qs[iqs];

@@ -466,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
  static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
  const block_q5_1 * x = (const block_q5_1 *) vx;

- const dfloat d = x[ib].dm.x;
- const dfloat m = x[ib].dm.y;
+ const dfloat d = __low2half(x[ib].dm);
+ const dfloat m = __high2half(x[ib].dm);

  uint32_t qh;
  memcpy(&qh, x[ib].qh, sizeof(qh));
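
In this and the neighbouring hunks, reads of the packed half2 fields switch from .x/.y member access to the __low2half/__high2half (or __low2float/__high2float) intrinsics, and writes go through an explicit half& cast; the values are identical, but the intrinsic form also compiles with the HIP fp16 headers. A minimal device-side sketch of the equivalence (illustration only, relies on <cuda_fp16.h> as already included above):

    __device__ void unpack_dm(const half2 dm, float & d, float & m) {
        d = __low2float(dm);  // same value the old code read as dm.x
        m = __high2float(dm); // same value the old code read as dm.y
    }
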
@@ -519,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
  const uint8_t q = x[i].qs[32*n + l];
  float * y = yy + i*QK_K + 128*n;

- float dall = x[i].dm.x;
- float dmin = x[i].dm.y;
+ float dall = __low2half(x[i].dm);
+ float dmin = __high2half(x[i].dm);
  y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
  y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
  y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -530,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
  const int il = tid%16; // 0...15
  const uint8_t q = x[i].qs[il] >> (2*is);
  float * y = yy + i*QK_K + 16*is + il;
- float dall = x[i].dm.x;
- float dmin = x[i].dm.y;
+ float dall = __low2half(x[i].dm);
+ float dmin = __high2half(x[i].dm);
  y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
  y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
  #endif
@@ -617,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float

  float * y = yy + i*QK_K + 64*il + n*ir;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint8_t * q = x[i].qs + 32*il + n*ir;

@@ -656,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float

  float * y = yy + i*QK_K + 64*il + 2*ir;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint8_t * ql = x[i].qs + 32*il + 2*ir;
  const uint8_t * qh = x[i].qh + 2*ir;
@@ -769,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
  const float * y = yy + i * QK_K + y_offset;
  const uint8_t * q = x[i].qs + q_offset;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
  aux[0] = a[0] & 0x0f0f0f0f;
@@ -990,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
  const float * y1 = yy + i*QK_K + y_offset;
  const float * y2 = y1 + 128;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint16_t * a = (const uint16_t *)x[i].scales;
  aux[0] = a[im+0] & kmask1;
@@ -1123,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
  const float * y1 = yy + i*QK_K + y_offset;
  const float * y2 = y1 + 128;

- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);

  const uint16_t * a = (const uint16_t *)x[i].scales;
  aux[0] = a[im+0] & kmask1;
@@ -1347,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
  return;
  }

- y[ib].ds.x = d;
- y[ib].ds.y = sum;
+ reinterpret_cast<half&>(y[ib].ds.x) = d;
+ reinterpret_cast<half&>(y[ib].ds.y) = sum;
  }

  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -2345,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
  u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
  }

- return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
  }

  template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2431,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
  #pragma unroll
  for (int i = 0; i < QR2_K; ++ i) {
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
- d8[i] = bq8_1[bq8_offset + i].ds.x;
+ d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
  }

  return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -2550,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
  #pragma unroll
  for (int i = 0; i < QR3_K; ++i) {
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
- d8[i] = bq8_1[bq8_offset + i].ds.x;
+ d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
  }

  return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -2719,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(

  for (int i = 0; i < QR4_K; ++i) {
  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
- d8[i] = bq8i->ds.x;
+ d8[i] = __low2half(bq8i->ds);

  const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
  u[2*i+0] = q8[0];
@@ -2746,8 +2848,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
  const float dall = bq4_K->d[0];
  const float dmin = bq4_K->d[1];

- const float d8_1 = bq8_1[0].ds.x;
- const float d8_2 = bq8_1[1].ds.x;
+ const float d8_1 = __low2float(bq8_1[0].ds);
+ const float d8_2 = __low2float(bq8_1[1].ds);

  const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
  const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2900,7 +3002,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
  #pragma unroll
  for (int i = 0; i < QR5_K; ++i) {
  const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
- d8[i] = bq8i->ds.x;
+ d8[i] = __low2float(bq8i->ds);

  const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
  u[2*i+0] = q8[0];
@@ -2918,8 +3020,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(

  const float d = bq5_K->d;

- const float d8_1 = bq8_1[0].ds.x;
- const float d8_2 = bq8_1[1].ds.x;
+ const float d8_1 = __low2half(bq8_1[0].ds);
+ const float d8_2 = __low2half(bq8_1[1].ds);

  const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
  const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -3074,7 +3176,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
  #pragma unroll
  for (int i = 0; i < QR6_K; ++i) {
  u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
- d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
+ d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
  }

  return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -3242,7 +3344,7 @@ static __device__ __forceinline__ void mul_mat_q(
  *dsi_dst = *dsi_src;
  } else {
  float * dfi_dst = (float *) dsi_dst;
- *dfi_dst = (*dsi_src).x;
+ *dfi_dst = __low2half(*dsi_src);
  }
  }

@@ -3886,13 +3988,13 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
  // rope == RoPE == rotary positional embedding
  static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
  const float p_delta, const int p_delta_rows, const float theta_scale) {
- const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+ const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

  if (col >= ncols) {
  return;
  }

- const int row = blockDim.y*blockIdx.y + threadIdx.y;
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
  const int i = row*ncols + col;

  const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
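
Here (and in diag_mask_inf_f32, soft_max_f32 and their launchers further down) the row/column work distribution is transposed: rows now ride on the x grid dimension and columns on y, with the host-side dim3 arguments swapped to match. A sketch of the pairing under the new launch shape:

    // host:   block_dims = (1, 2*CUDA_ROPE_BLOCK_SIZE, 1); block_nums = (nrows, num_blocks_x, 1);
    // device: const int row = blockDim.x*blockIdx.x + threadIdx.x;     // one row per x block
    //         const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); // column pairs along y
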
@@ -3906,6 +4008,28 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
  dst[i + 1] = x0*sin_theta + x1*cos_theta;
  }

+ static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
+ const float p_delta, const int p_delta_rows, const float theta_scale) {
+ const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+ if (col >= ncols) {
+ return;
+ }
+
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
+ const int i = row*ncols + col/2;
+
+ const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+ const float sin_theta = sinf(theta);
+ const float cos_theta = cosf(theta);
+
+ const float x0 = x[i + 0];
+ const float x1 = x[i + ncols/2];
+
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
+ dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+ }
+
  static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
  const int half_n_dims = ncols/4;
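
The new rope_neox_f32 kernel differs from rope_f32 only in which elements it pairs: the GPT-NeoX layout rotates element i with element i + ncols/2 instead of the adjacent pair (i, i+1). The rotation itself is unchanged:

    // for a pair (x0, x1) at angle theta:
    //   dst0 = x0*cos(theta) - x1*sin(theta)
    //   dst1 = x0*sin(theta) + x1*cos(theta)
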
@@ -3940,9 +4064,32 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
  dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
  }

- static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+ static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
+ const int n_heads_log2_floor, const float m0, const float m1) {
  const int col = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (col >= ncols) {
+ return;
+ }
+
  const int row = blockDim.y*blockIdx.y + threadIdx.y;
+ const int i = row*ncols + col;
+
+ const int k = row/k_rows;
+
+ float m_k;
+ if (k < n_heads_log2_floor) {
+ m_k = powf(m0, k + 1);
+ } else {
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+ }
+
+ dst[i] = col * m_k + x[i];
+ }
+
+ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+ const int col = blockDim.y*blockIdx.y + threadIdx.y;
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;

  if (col >= ncols) {
  return;
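
alibi_f32 applies the ALiBi positional bias: each head k gets a slope m_k, and column col contributes col * m_k on top of the input value. A host-side sketch of the per-head slope that mirrors the kernel's branches (illustration only):

    #include <math.h>

    // slope for head k, with n_heads_log2_floor = 2^floor(log2(n_head))
    static float alibi_slope(int k, int n_heads_log2_floor, float m0, float m1) {
        return k < n_heads_log2_floor ? powf(m0, k + 1)
                                      : powf(m1, 2*(k - n_heads_log2_floor) + 1);
    }
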
@@ -3955,24 +4102,29 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int

  // the CUDA soft max implementation differs from the CPU implementation
  // instead of doubles floats are used
- // values are also not normalized to the maximum value by subtracting it in the exponential function
- // theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
  static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
- const int row = blockDim.y*blockIdx.y + threadIdx.y;
- const int block_size = blockDim.x;
- const int tid = threadIdx.x;
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
+ const int block_size = blockDim.y;
+ const int tid = threadIdx.y;

- float tmp = 0.0;
+ float max_val = -INFINITY;

- for (int block_start = 0; block_start < ncols; block_start += block_size) {
- const int col = block_start + tid;
+ for (int col = tid; col < ncols; col += block_size) {
+ const int i = row*ncols + col;
+ max_val = max(max_val, x[i]);
+ }

- if (col >= ncols) {
- break;
- }
+ // find the max value in the block
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+ }
+
+ float tmp = 0.f;

+ for (int col = tid; col < ncols; col += block_size) {
  const int i = row*ncols + col;
- const float val = expf(x[i]);
+ const float val = expf(x[i] - max_val);
  tmp += val;
  dst[i] = val;
  }
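
This and the next hunk rewrite soft_max_f32 into the numerically stable form: the row maximum is reduced across the warp and subtracted inside expf, so large logits no longer overflow, and the final division is replaced by multiplication with a precomputed reciprocal. A single-threaded C reference of the same computation (sketch only):

    #include <math.h>

    static void softmax_row_reference(const float * x, float * dst, int ncols) {
        float max_val = -INFINITY;
        for (int i = 0; i < ncols; ++i) max_val = fmaxf(max_val, x[i]);
        float sum = 0.f;
        for (int i = 0; i < ncols; ++i) { dst[i] = expf(x[i] - max_val); sum += dst[i]; }
        const float inv_sum = 1.f/sum;
        for (int i = 0; i < ncols; ++i) dst[i] *= inv_sum;
    }
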
@@ -3983,15 +4135,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
  tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
  }

- for (int block_start = 0; block_start < ncols; block_start += block_size) {
- const int col = block_start + tid;
-
- if (col >= ncols) {
- break;
- }
+ const float inv_tmp = 1.f / tmp;

+ for (int col = tid; col < ncols; col += block_size) {
  const int i = row*ncols + col;
- dst[i] /= tmp;
+ dst[i] *= inv_tmp;
  }
  }

@@ -4751,13 +4899,21 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons

  static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
  const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
- GGML_ASSERT(nrows % 2 == 0);
- const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+ GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
+ const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
  const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
- const dim3 block_nums(num_blocks_x, nrows, 1);
+ const dim3 block_nums(nrows, num_blocks_x, 1);
  rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
  }

+ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+ const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+ const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
+ const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+ const dim3 block_nums(nrows, num_blocks_x, 1);
+ rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
+ }
+
  static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
  GGML_ASSERT(nrows % 4 == 0);
  const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@@ -4766,16 +4922,25 @@ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, con
  rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
  }

+ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
+ const int k_rows, const int n_heads_log2_floor, const float m0,
+ const float m1, cudaStream_t stream) {
+ const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
+ const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
+ const dim3 block_nums(num_blocks_x, nrows, 1);
+ alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
+ }
+
  static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
- const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
+ const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
  const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
- const dim3 block_nums(block_num_x, nrows_x, 1);
+ const dim3 block_nums(nrows_x, block_num_x, 1);
  diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
  }

  static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
- const dim3 block_dims(WARP_SIZE, 1, 1);
- const dim3 block_nums(1, nrows_x, 1);
+ const dim3 block_dims(1, WARP_SIZE, 1);
+ const dim3 block_nums(nrows_x, 1, 1);
  soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
  }

@@ -4880,10 +5045,18 @@ void ggml_init_cublas() {
  static bool initialized = false;

  if (!initialized) {
+
+ #ifdef __HIP_PLATFORM_AMD__
+ // Workaround for a rocBLAS bug when using multiple graphics cards:
+ // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+ rocblas_initialize();
+ CUDA_CHECK(cudaDeviceSynchronize());
+ #endif
+
  CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
  GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
  int64_t total_vram = 0;
- fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+ fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
  for (int id = 0; id < g_device_count; ++id) {
  cudaDeviceProp prop;
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
@@ -5481,7 +5654,8 @@ inline void ggml_cuda_op_rope(

  const float theta_scale = powf(freq_base, -2.0f/n_dims);

- const bool is_glm = mode & 4;
+ const bool is_neox = mode & 2;
+ const bool is_glm = mode & 4;

  // compute
  if (is_glm) {
@@ -5489,6 +5663,10 @@ inline void ggml_cuda_op_rope(
  const float id_p = min(p, n_ctx - 2.f);
  const float block_p = max(p - (n_ctx - 2.f), 0.f);
  rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+ } else if (is_neox) {
+ GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
+ const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+ rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
  } else {
  const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
  rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
@@ -5501,6 +5679,41 @@ inline void ggml_cuda_op_rope(
  (void) i1;
  }

+ inline void ggml_cuda_op_alibi(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
+
+ GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t i01_diff = i01_high - i01_low;
+
+ const int n_past = ((int32_t *) dst->op_params)[0];
+ const int n_head = ((int32_t *) dst->op_params)[1];
+ float max_bias;
+ memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+
+ GGML_ASSERT(ne01 + n_past == ne00);
+ GGML_ASSERT(n_head == ne02);
+
+ const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+
+ const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+
+ // compute
+ alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
+
+ (void) src1;
+ (void) src0_ddq_i;
+ (void) src1_ddf_i;
+ (void) i1;
+ }
+
  inline void ggml_cuda_op_diag_mask_inf(
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
  float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -6121,6 +6334,11 @@ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ten
  ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
  }

+ void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+ }
+
  void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  (void) src0;
  (void) src1;
@@ -6240,7 +6458,7 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
  return extra;
  }

- void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
+ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
  if (scratch && g_scratch_size == 0) {
  return;
  }
@@ -6249,14 +6467,19 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
  const ggml_op src0_op = tensor->src[0]->op;
  if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
- ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
+ ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
  }
  }
  if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
- ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
+ ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
  }

  tensor->backend = GGML_BACKEND_GPU;
+
+ if (scratch && no_alloc) {
+ return;
+ }
+
  struct ggml_tensor_extra_gpu * extra;

  const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
@@ -6308,16 +6531,48 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
  tensor->extra = extra;
  }

+ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
+ if (g_scratch_size == 0) {
+ return;
+ }
+ if (g_scratch_buffer == nullptr) {
+ CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
+ }
+
+ struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+
+ const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
+ tensor->op == GGML_OP_VIEW;
+
+ if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+ char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+ size_t view_offset = 0;
+ if (tensor->op == GGML_OP_VIEW) {
+ memcpy(&view_offset, tensor->op_params, sizeof(size_t));
+ }
+ extra->data_device[g_main_device] = src0_ddc + view_offset;
+ } else {
+ extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
+ }
+
+ tensor->extra = extra;
+ }
+
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, true, false);
+ ggml_cuda_assign_buffers_impl(tensor, true, false, false);
+ }
+
+ void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
+ ggml_cuda_assign_buffers_impl(tensor, true, false, true);
  }

  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, false, false);
+ ggml_cuda_assign_buffers_impl(tensor, false, false, false);
  }

  void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, false, true);
+ ggml_cuda_assign_buffers_impl(tensor, false, true, false);
  }

  void ggml_cuda_set_main_device(int main_device) {
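
The new ggml_cuda_assign_buffers_no_alloc marks a tensor as GPU-resident without reserving scratch memory, and ggml_cuda_assign_scratch_offset later binds it to a caller-chosen offset inside g_scratch_buffer. A hypothetical call sequence (the tensor and offset names are illustrative, not taken from the diff):

    ggml_cuda_assign_buffers_no_alloc(cur);           // sets backend = GGML_BACKEND_GPU, defers allocation
    ggml_cuda_assign_scratch_offset(cur, cur_offset); // data_device = g_scratch_buffer + cur_offset
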
@@ -6456,6 +6711,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
  }
  func = ggml_cuda_rope;
  break;
+ case GGML_OP_ALIBI:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_alibi;
+ break;
  default:
  return false;
  }