llama_cpp 0.3.8 → 0.4.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +110 -117
- data/ext/llama_cpp/src/ggml-alloc.c +79 -65
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +330 -69
- data/ext/llama_cpp/src/ggml-cuda.h +13 -0
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +102 -66
- data/ext/llama_cpp/src/ggml-metal.metal +113 -9
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama.cpp +4520 -2978
- data/ext/llama_cpp/src/llama.h +133 -125
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +7 -8
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cu
@@ -6,15 +6,116 @@
 #include <atomic>
 #include <assert.h>
 
+#if defined(GGML_USE_HIPBLAS)
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F HIPBLAS_R_16F
+#define CUDA_R_32F HIPBLAS_R_32F
+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasCreate hipblasCreate
+#define cublasGemmEx hipblasGemmEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventRecord hipEventRecord
+#define cudaEvent_t hipEvent_t
+#define cudaEventDestroy hipEventDestroy
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaMalloc hipMalloc
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyKind hipMemcpyKind
+#define cudaMemset hipMemset
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+#else
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
+#endif
 
 #include "ggml-cuda.h"
 #include "ggml.h"
 
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#ifndef CC_TURING
 #define CC_TURING 700
+#endif
+
+#if defined(GGML_USE_HIPBLAS)
+#define __CUDA_ARCH__ 1300
+
+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+    return reinterpret_cast<const int&>(c);
+}
+
+static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(__gfx1100__)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+}
+#endif
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
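
The HIPBLAS block above lets the same source file build for AMD GPUs with hipcc: CUDA runtime and cuBLAS symbols are #defined to their HIP/hipBLAS equivalents, and __dp4a — a 4-way int8 dot product with 32-bit accumulate — is emulated via AMD dot-product builtins, inline assembly, or plain element-wise code depending on the target gfx architecture. As a reference for the intrinsic's contract, here is a minimal host-side C++ sketch of its semantics (dp4a_ref is our name, not from the diff):

#include <cstdint>
#include <cstdio>

// Reference semantics of __dp4a(a, b, c): treat each 32-bit int as four
// packed signed bytes, multiply pairwise, and accumulate into c.
static int dp4a_ref(int a, int b, int c) {
    const int8_t * va = reinterpret_cast<const int8_t *>(&a);
    const int8_t * vb = reinterpret_cast<const int8_t *>(&b);
    for (int i = 0; i < 4; ++i) {
        c += int(va[i]) * int(vb[i]);
    }
    return c;
}

int main() {
    // 0x01020304 packs the bytes {4, 3, 2, 1} on a little-endian machine
    printf("%d\n", dp4a_ref(0x01020304, 0x01010101, 0)); // 4+3+2+1 = 10
    return 0;
}
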
@@ -259,6 +360,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
@@ -286,7 +388,7 @@ static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q = false;
+static bool g_mul_mat_q = true;
 
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
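
The one-line change here flips g_mul_mat_q to true, so the quantized matrix-multiplication (mul-mat-q) kernels become the default path unless the caller turns them off; the scratch buffer default stays at 1 GB.
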
@@ -423,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q4_1 * x = (const block_q4_1 *) vx;
 
-    const dfloat d = x[ib].dm.x;
-    const dfloat m = x[ib].dm.y;
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
 
     const int vui = x[ib].qs[iqs];
 
@@ -466,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q5_1 * x = (const block_q5_1 *) vx;
 
-    const dfloat d = x[ib].dm.x;
-    const dfloat m = x[ib].dm.y;
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
 
     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -519,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const uint8_t q = x[i].qs[32*n + l];
     float * y = yy + i*QK_K + 128*n;
 
-    float dall = x[i].dm.x;
-    float dmin = x[i].dm.y;
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
     y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -530,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const int il = tid%16; // 0...15
     const uint8_t q = x[i].qs[il] >> (2*is);
     float * y = yy + i*QK_K + 16*is + il;
-    float dall = x[i].dm.x;
-    float dmin = x[i].dm.y;
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
     y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
 #endif
@@ -617,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 
     float * y = yy + i*QK_K + 64*il + n*ir;
 
-    const float dall = x[i].dm.x;
-    const float dmin = x[i].dm.y;
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
 
     const uint8_t * q = x[i].qs + 32*il + n*ir;
 
@@ -656,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
 
     float * y = yy + i*QK_K + 64*il + 2*ir;
 
-    const float dall = x[i].dm.x;
-    const float dmin = x[i].dm.y;
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
 
     const uint8_t * ql = x[i].qs + 32*il + 2*ir;
     const uint8_t * qh = x[i].qh + 2*ir;
@@ -769,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
         const float * y = yy + i * QK_K + y_offset;
         const uint8_t * q = x[i].qs + q_offset;
 
-        const float dall = x[i].dm.x;
-        const float dmin = x[i].dm.y;
+        const float dall = __low2half(x[i].dm);
+        const float dmin = __high2half(x[i].dm);
 
         const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
         aux[0] = a[0] & 0x0f0f0f0f;
@@ -990,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
         const float * y1 = yy + i*QK_K + y_offset;
         const float * y2 = y1 + 128;
 
-        const float dall = x[i].dm.x;
-        const float dmin = x[i].dm.y;
+        const float dall = __low2half(x[i].dm);
+        const float dmin = __high2half(x[i].dm);
 
         const uint16_t * a = (const uint16_t *)x[i].scales;
         aux[0] = a[im+0] & kmask1;
@@ -1123,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
         const float * y1 = yy + i*QK_K + y_offset;
         const float * y2 = y1 + 128;
 
-        const float dall = x[i].dm.x;
-        const float dmin = x[i].dm.y;
+        const float dall = __low2half(x[i].dm);
+        const float dmin = __high2half(x[i].dm);
 
         const uint16_t * a = (const uint16_t *)x[i].scales;
         aux[0] = a[im+0] & kmask1;
@@ -1347,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
         return;
     }
 
-    y[ib].ds.x = d;
-    y[ib].ds.y = sum;
+    reinterpret_cast<half&>(y[ib].ds.x) = d;
+    reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
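
The common thread in the hunks above: the quantized blocks pack two fp16 scalars into a single half2 (dm holds the scale and the min; block_q8_1.ds holds the scale and the sum), and .x/.y member access on half2 does not port to HIP, so reads now go through the __low2half/__high2half/__low2float intrinsics. A minimal sketch of the portable accessors (kernel and names are ours, not from the diff):

#include <cstdio>
#include <cuda_fp16.h>

// Pack two fp16 scalars into a half2 and read them back with the portable
// intrinsics used in the diff instead of .x/.y field access.
__global__ void unpack_demo(const half2 dm, float * out) {
    out[0] = __half2float(__low2half(dm));  // the "d" (scale) component
    out[1] = __half2float(__high2half(dm)); // the "m" (min) component
}

int main() {
    float * out;
    cudaMallocManaged(&out, 2*sizeof(float));
    unpack_demo<<<1, 1>>>(__floats2half2_rn(0.5f, 2.0f), out);
    cudaDeviceSynchronize();
    printf("d = %f, m = %f\n", out[0], out[1]); // d = 0.5, m = 2.0
    cudaFree(out);
    return 0;
}
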
@@ -2345,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
         u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
     }
 
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2431,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR2_K; ++ i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
     }
 
     return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -2550,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR3_K; ++i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
     }
 
     return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -2719,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 
     for (int i = 0; i < QR4_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds.x;
+        d8[i] = __low2half(bq8i->ds);
 
         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
@@ -2746,8 +2848,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     const float dall = bq4_K->d[0];
     const float dmin = bq4_K->d[1];
 
-    const float d8_1 = bq8_1[0].ds.x;
-    const float d8_2 = bq8_1[1].ds.x;
+    const float d8_1 = __low2float(bq8_1[0].ds);
+    const float d8_2 = __low2float(bq8_1[1].ds);
 
     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2900,7 +3002,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR5_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds.x;
+        d8[i] = __low2float(bq8i->ds);
 
         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
@@ -2918,8 +3020,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 
     const float d = bq5_K->d;
 
-    const float d8_1 = bq8_1[0].ds.x;
-    const float d8_2 = bq8_1[1].ds.x;
+    const float d8_1 = __low2half(bq8_1[0].ds);
+    const float d8_2 = __low2half(bq8_1[1].ds);
 
     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -3074,7 +3176,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR6_K; ++i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
    }
 
     return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -3242,7 +3344,7 @@ static __device__ __forceinline__ void mul_mat_q(
             *dsi_dst = *dsi_src;
         } else {
             float * dfi_dst = (float *) dsi_dst;
-            *dfi_dst = (*dsi_src).x;
+            *dfi_dst = __low2half(*dsi_src);
         }
     }
 
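For reference when reading the vec-dot hunks: quantize_q8_1 (above) stores the scale d in the low half of ds and the sum of the block's input values in the high half; each per-format dot product pulls out whichever component it needs. A hypothetical host-side reference with an unpacked fp32 layout in place of the half2 (our sketch, assuming a 32-value block):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical reference of q8_1 quantization for one 32-value block:
// d is the scale (low half of ds), s is the sum of the block's inputs
// (high half of ds); the CUDA kernel packs both into a single half2.
struct block_q8_1_ref {
    float  d;      // scale
    float  s;      // sum of the original values
    int8_t qs[32]; // quantized values
};

static block_q8_1_ref quantize_q8_1_ref(const float * x) {
    block_q8_1_ref b{};
    float amax = 0.0f;
    float sum  = 0.0f;
    for (int i = 0; i < 32; ++i) {
        amax = std::max(amax, std::fabs(x[i]));
        sum += x[i];
    }
    b.d = amax / 127.0f;
    const float id = b.d != 0.0f ? 1.0f/b.d : 0.0f;
    for (int i = 0; i < 32; ++i) {
        b.qs[i] = (int8_t) std::round(x[i]*id);
    }
    b.s = sum;
    return b;
}
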
@@ -3886,13 +3988,13 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
 // rope == RoPE == rotary positional embedding
 static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
                                 const float p_delta, const int p_delta_rows, const float theta_scale) {
-    const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
     if (col >= ncols) {
         return;
     }
 
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
     const int i = row*ncols + col;
 
     const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
@@ -3906,6 +4008,28 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }
 
+static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
+                                     const float p_delta, const int p_delta_rows, const float theta_scale) {
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i = row*ncols + col/2;
+
+    const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + ncols/2];
+
+    dst[i + 0]       = x0*cos_theta - x1*sin_theta;
+    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+}
+
 static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;
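
The new rope_neox_f32 differs from rope_f32 only in which elements form a rotation pair: the classic kernel rotates adjacent elements (col, col+1), while the NeoX variant rotates elements half a row apart (col/2, col/2 + ncols/2), under the same theta schedule. A CPU sketch of the two pairings (function name ours):

#include <cmath>

// Apply RoPE to one row of ncols floats; p is the (scaled) position and
// theta_scale the per-pair frequency decay, matching the kernels above.
static void rope_row_ref(float * x, int ncols, float p, float theta_scale, bool neox) {
    float theta = p;
    for (int c = 0; c < ncols/2; ++c) {
        const float cs = std::cos(theta), sn = std::sin(theta);
        const int i0 = neox ? c : 2*c;            // NeoX: pair (c, c + ncols/2)
        const int i1 = neox ? c + ncols/2 : 2*c + 1; // classic: pair (2c, 2c+1)
        const float x0 = x[i0], x1 = x[i1];
        x[i0] = x0*cs - x1*sn;
        x[i1] = x0*sn + x1*cs;
        theta *= theta_scale; // theta_c = p * theta_scale^c
    }
}
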
@@ -3940,9 +4064,32 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
 }
 
-static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
+                                 const int n_heads_log2_floor, const float m0, const float m1) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (col >= ncols) {
+        return;
+    }
+
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i = row*ncols + col;
+
+    const int k = row/k_rows;
+
+    float m_k;
+    if (k < n_heads_log2_floor) {
+        m_k = powf(m0, k + 1);
+    } else {
+        m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+    }
+
+    dst[i] = col * m_k + x[i];
+}
+
+static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+    const int col = blockDim.y*blockIdx.y + threadIdx.y;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (col >= ncols) {
         return;
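
alibi_f32 implements the ALiBi bias: rows are grouped into heads of k_rows rows each, head k gets a slope m_k, and col * m_k is added to every score. A host-side sketch combining the kernel with the slope setup that ggml_cuda_op_alibi performs further down (name and fp32 layout ours):

#include <cmath>

// Add the ALiBi bias to an (n_head*nrows_per_head) x ncols score matrix.
// Heads below 2^floor(log2(n_head)) use powers of m0, the rest powers of m1.
static void alibi_ref(const float * x, float * dst, int ncols, int nrows_per_head,
                      int n_head, float max_bias) {
    const int   n2 = 1 << (int) std::floor(std::log2((double) n_head));
    const float m0 = std::pow(2.0f, -max_bias / n2);
    const float m1 = std::pow(2.0f, -(max_bias/2.0f) / n2);

    for (int row = 0; row < n_head*nrows_per_head; ++row) {
        const int k = row / nrows_per_head;
        const float m_k = k < n2 ? std::pow(m0, (float)(k + 1))
                                 : std::pow(m1, (float)(2*(k - n2) + 1));
        for (int col = 0; col < ncols; ++col) {
            dst[row*ncols + col] = x[row*ncols + col] + col*m_k;
        }
    }
}
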
@@ -3955,24 +4102,29 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
 
 // the CUDA soft max implementation differs from the CPU implementation
 // instead of doubles floats are used
-// values are also not normalized to the maximum value by subtracting it in the exponential function
-// theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
 static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-    const int block_size = blockDim.x;
-    const int tid = threadIdx.x;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int block_size = blockDim.y;
+    const int tid = threadIdx.y;
 
-    float tmp = 0.0;
+    float max_val = -INFINITY;
 
-    for (int block_start = 0; block_start < ncols; block_start += block_size) {
-        const int col = block_start + tid;
+    for (int col = tid; col < ncols; col += block_size) {
+        const int i = row*ncols + col;
+        max_val = max(max_val, x[i]);
+    }
 
-        if (col >= ncols) {
-            break;
-        }
+    // find the max value in the block
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+    }
+
+    float tmp = 0.f;
 
+    for (int col = tid; col < ncols; col += block_size) {
         const int i = row*ncols + col;
-        const float val = expf(x[i]);
+        const float val = expf(x[i] - max_val);
         tmp += val;
         dst[i] = val;
     }
@@ -3983,15 +4135,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
     }
 
-    for (int block_start = 0; block_start < ncols; block_start += block_size) {
-        const int col = block_start + tid;
-
-        if (col >= ncols) {
-            break;
-        }
+    const float inv_tmp = 1.f / tmp;
 
+    for (int col = tid; col < ncols; col += block_size) {
         const int i = row*ncols + col;
-        dst[i] /= tmp;
+        dst[i] *= inv_tmp;
     }
 }
 
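The rework above makes the kernel numerically stable: each row's maximum is found first (strided loop plus a warp shuffle reduction), subtracted inside expf so the exponentials cannot overflow, and the final normalization multiplies by a precomputed reciprocal. A CPU sketch of the same row-wise computation (function name ours):

#include <algorithm>
#include <cmath>

// Numerically stable softmax over one row of ncols floats.
static void softmax_row_ref(const float * x, float * dst, int ncols) {
    float max_val = -INFINITY;
    for (int c = 0; c < ncols; ++c) max_val = std::max(max_val, x[c]);

    float sum = 0.0f;
    for (int c = 0; c < ncols; ++c) {
        dst[c] = std::exp(x[c] - max_val); // shifted by the row max
        sum += dst[c];
    }
    const float inv_sum = 1.0f/sum;
    for (int c = 0; c < ncols; ++c) dst[c] *= inv_sum;
}
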
@@ -4751,13 +4899,21 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
 
 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                           const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(nrows % 2 == 0);
-    const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+    GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
+    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-    const dim3 block_nums(num_blocks_x, nrows, 1);
+    const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }
 
+static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+                               const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nrows, num_blocks_x, 1);
+    rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
+}
+
 static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(nrows % 4 == 0);
     const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@@ -4766,16 +4922,25 @@ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, con
     rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
 }
 
+static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
+                           const int k_rows, const int n_heads_log2_floor, const float m0,
+                           const float m1, cudaStream_t stream) {
+    const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
+    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
+    const dim3 block_nums(num_blocks_x, nrows, 1);
+    alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
-    const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
+    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
-    const dim3 block_nums(block_num_x, nrows_x, 1);
+    const dim3 block_nums(nrows_x, block_num_x, 1);
     diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 }
 
 static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums(1, nrows_x, 1);
+    const dim3 block_dims(1, WARP_SIZE, 1);
+    const dim3 block_nums(nrows_x, 1, 1);
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
 }
 
@@ -4880,10 +5045,18 @@ void ggml_init_cublas() {
     static bool initialized = false;
 
     if (!initialized) {
+
+#ifdef __HIP_PLATFORM_AMD__
+        // Workaround for a rocBLAS bug when using multiple graphics cards:
+        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+        rocblas_initialize();
+        CUDA_CHECK(cudaDeviceSynchronize());
+#endif
+
         CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
-        fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
         for (int id = 0; id < g_device_count; ++id) {
             cudaDeviceProp prop;
             CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
@@ -5481,7 +5654,8 @@ inline void ggml_cuda_op_rope(
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
-    const bool is_glm = mode & 4;
+    const bool is_neox = mode & 2;
+    const bool is_glm  = mode & 4;
 
     // compute
     if (is_glm) {
@@ -5489,6 +5663,10 @@ inline void ggml_cuda_op_rope(
         const float id_p = min(p, n_ctx - 2.f);
         const float block_p = max(p - (n_ctx - 2.f), 0.f);
         rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+    } else if (is_neox) {
+        GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
+        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+        rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
     } else {
         const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
         rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
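
ggml_cuda_op_rope decodes mode as a bitfield: bit 0 controls whether positions start at n_past or at 0, bit 1 selects the NeoX pairing, and bit 2 the ChatGLM variant. Hypothetical constants for the bits tested above (the diff itself only uses the raw masks):

// Hypothetical names for the rope mode bits decoded above; the diff tests
// them as (mode & 1), (mode & 2), and (mode & 4).
enum rope_mode_bits {
    ROPE_SKIP_N_PAST = 1, // bit 0 set: positions start at 0 instead of n_past
    ROPE_MODE_NEOX   = 2, // bit 1: GPT-NeoX-style pairing (rope_neox_f32)
    ROPE_MODE_GLM    = 4, // bit 2: ChatGLM-style rotary embedding (rope_glm_f32)
};
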
@@ -5501,6 +5679,41 @@ inline void ggml_cuda_op_rope(
     (void) i1;
 }
 
+inline void ggml_cuda_op_alibi(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_head = ((int32_t *) dst->op_params)[1];
+    float max_bias;
+    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+
+    GGML_ASSERT(ne01 + n_past == ne00);
+    GGML_ASSERT(n_head == ne02);
+
+    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+
+    // compute
+    alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
+
+    (void) src1;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
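
Worked example of the slope setup: with n_head = 8 and max_bias = 8.0 (a typical value), n_heads_log2_floor = 8, so m0 = 2^(-8/8) = 0.5 and head k receives slope m_k = 0.5^(k+1), i.e. 1/2, 1/4, ..., 1/256 — the geometric slope sequence from the ALiBi paper. m1 only comes into play when n_head is not a power of two.
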
@@ -6121,6 +6334,11 @@ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ten
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }
 
+void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+}
+
 void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
@@ -6240,7 +6458,7 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     return extra;
 }
 
-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
@@ -6249,14 +6467,19 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
-            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
+            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
         }
     }
     if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
+        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
     }
 
     tensor->backend = GGML_BACKEND_GPU;
+
+    if (scratch && no_alloc) {
+        return;
+    }
+
     struct ggml_tensor_extra_gpu * extra;
 
     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
@@ -6308,16 +6531,48 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     tensor->extra = extra;
 }
 
+void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
+    if (g_scratch_size == 0) {
+        return;
+    }
+    if (g_scratch_buffer == nullptr) {
+        CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
+    }
+
+    struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+
+    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
+        tensor->op == GGML_OP_VIEW;
+
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+        size_t view_offset = 0;
+        if (tensor->op == GGML_OP_VIEW) {
+            memcpy(&view_offset, tensor->op_params, sizeof(size_t));
+        }
+        extra->data_device[g_main_device] = src0_ddc + view_offset;
+    } else {
+        extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
+    }
+
+    tensor->extra = extra;
+}
+
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true, false);
+    ggml_cuda_assign_buffers_impl(tensor, true, false, false);
+}
+
+void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, true, false, true);
 }
 
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, false);
+    ggml_cuda_assign_buffers_impl(tensor, false, false, false);
 }
 
 void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, true);
+    ggml_cuda_assign_buffers_impl(tensor, false, true, false);
 }
 
 void ggml_cuda_set_main_device(int main_device) {
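
The new no_alloc path splits GPU placement into two phases: ggml_cuda_assign_buffers_no_alloc marks a tensor as GPU-resident without reserving scratch memory, and ggml_cuda_assign_scratch_offset later binds it to a caller-chosen offset inside the single scratch allocation (allocating that buffer lazily). A hypothetical caller, assuming the two functions are exported by ggml-cuda.h (consistent with its +13-line change) and that offsets come from a graph-measuring allocator such as ggml-alloc:

// Hypothetical usage sketch of the two-phase placement.
extern "C" {
    struct ggml_tensor;
    void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
    void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
}

static void place_on_gpu(struct ggml_tensor * t, size_t planned_offset) {
    ggml_cuda_assign_buffers_no_alloc(t);               // sets backend, defers memory
    ggml_cuda_assign_scratch_offset(t, planned_offset); // binds data_device to scratch + offset
}
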
@@ -6456,6 +6711,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_rope;
             break;
+        case GGML_OP_ALIBI:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_alibi;
+            break;
         default:
             return false;
     }