llama_cpp 0.3.8 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/README.md +1 -1
- data/examples/chat.rb +4 -6
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +129 -124
- data/ext/llama_cpp/src/ggml-alloc.c +90 -113
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +350 -77
- data/ext/llama_cpp/src/ggml-cuda.h +13 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +226 -121
- data/ext/llama_cpp/src/ggml-metal.metal +157 -35
- data/ext/llama_cpp/src/ggml.c +2724 -584
- data/ext/llama_cpp/src/ggml.h +282 -31
- data/ext/llama_cpp/src/k_quants.c +112 -56
- data/ext/llama_cpp/src/llama.cpp +4857 -2986
- data/ext/llama_cpp/src/llama.h +180 -126
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -2
- data/sig/llama_cpp.rbs +12 -11
- metadata +2 -2
@@ -6,15 +6,116 @@
|
|
6
6
|
#include <atomic>
|
7
7
|
#include <assert.h>
|
8
8
|
|
9
|
+
#if defined(GGML_USE_HIPBLAS)
|
10
|
+
#include <hip/hip_runtime.h>
|
11
|
+
#include <hipblas/hipblas.h>
|
12
|
+
#include <hip/hip_fp16.h>
|
13
|
+
#ifdef __HIP_PLATFORM_AMD__
|
14
|
+
// for rocblas_initialize()
|
15
|
+
#include "rocblas/rocblas.h"
|
16
|
+
#endif
|
17
|
+
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
|
18
|
+
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
19
|
+
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
|
20
|
+
#define CUBLAS_OP_N HIPBLAS_OP_N
|
21
|
+
#define CUBLAS_OP_T HIPBLAS_OP_T
|
22
|
+
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
23
|
+
#define CUBLAS_TF32_TENSOR_OP_MATH 0
|
24
|
+
#define CUDA_R_16F HIPBLAS_R_16F
|
25
|
+
#define CUDA_R_32F HIPBLAS_R_32F
|
26
|
+
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
27
|
+
#define cublasCreate hipblasCreate
|
28
|
+
#define cublasGemmEx hipblasGemmEx
|
29
|
+
#define cublasHandle_t hipblasHandle_t
|
30
|
+
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
31
|
+
#define cublasSetStream hipblasSetStream
|
32
|
+
#define cublasSgemm hipblasSgemm
|
33
|
+
#define cublasStatus_t hipblasStatus_t
|
34
|
+
#define cudaDeviceProp hipDeviceProp_t
|
35
|
+
#define cudaDeviceSynchronize hipDeviceSynchronize
|
36
|
+
#define cudaError_t hipError_t
|
37
|
+
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
38
|
+
#define cudaEventDisableTiming hipEventDisableTiming
|
39
|
+
#define cudaEventRecord hipEventRecord
|
40
|
+
#define cudaEvent_t hipEvent_t
|
41
|
+
#define cudaEventDestroy hipEventDestroy
|
42
|
+
#define cudaFree hipFree
|
43
|
+
#define cudaFreeHost hipHostFree
|
44
|
+
#define cudaGetDevice hipGetDevice
|
45
|
+
#define cudaGetDeviceCount hipGetDeviceCount
|
46
|
+
#define cudaGetDeviceProperties hipGetDeviceProperties
|
47
|
+
#define cudaGetErrorString hipGetErrorString
|
48
|
+
#define cudaGetLastError hipGetLastError
|
49
|
+
#define cudaMalloc hipMalloc
|
50
|
+
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
51
|
+
#define cudaMemcpy hipMemcpy
|
52
|
+
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
53
|
+
#define cudaMemcpyAsync hipMemcpyAsync
|
54
|
+
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
55
|
+
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
56
|
+
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
57
|
+
#define cudaMemcpyKind hipMemcpyKind
|
58
|
+
#define cudaMemset hipMemset
|
59
|
+
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
60
|
+
#define cudaSetDevice hipSetDevice
|
61
|
+
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
62
|
+
#define cudaStreamNonBlocking hipStreamNonBlocking
|
63
|
+
#define cudaStreamSynchronize hipStreamSynchronize
|
64
|
+
#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
|
65
|
+
#define cudaStream_t hipStream_t
|
66
|
+
#define cudaSuccess hipSuccess
|
67
|
+
#else
|
9
68
|
#include <cuda_runtime.h>
|
10
69
|
#include <cublas_v2.h>
|
11
70
|
#include <cuda_fp16.h>
|
71
|
+
#endif
|
12
72
|
|
13
73
|
#include "ggml-cuda.h"
|
14
74
|
#include "ggml.h"
|
15
75
|
|
16
76
|
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
77
|
+
#ifndef CC_TURING
|
17
78
|
#define CC_TURING 700
|
79
|
+
#endif
|
80
|
+
|
81
|
+
#if defined(GGML_USE_HIPBLAS)
|
82
|
+
#define __CUDA_ARCH__ 1300
|
83
|
+
|
84
|
+
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
|
85
|
+
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
86
|
+
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
87
|
+
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
88
|
+
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
|
89
|
+
return reinterpret_cast<const int&>(c);
|
90
|
+
}
|
91
|
+
|
92
|
+
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
93
|
+
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
|
94
|
+
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
95
|
+
#elif defined(__gfx1100__)
|
96
|
+
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|
97
|
+
#elif defined(__gfx1010__) || defined(__gfx900__)
|
98
|
+
int tmp1;
|
99
|
+
int tmp2;
|
100
|
+
asm("\n \
|
101
|
+
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
|
102
|
+
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
|
103
|
+
v_add3_u32 %0, %1, %2, %0 \n \
|
104
|
+
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
|
105
|
+
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
|
106
|
+
v_add3_u32 %0, %1, %2, %0 \n \
|
107
|
+
"
|
108
|
+
: "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
|
109
|
+
: "v"(a), "v"(b)
|
110
|
+
);
|
111
|
+
#else
|
112
|
+
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
113
|
+
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
114
|
+
c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
|
115
|
+
#endif
|
116
|
+
return c;
|
117
|
+
}
|
118
|
+
#endif
|
18
119
|
|
19
120
|
#if defined(_MSC_VER)
|
20
121
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
@@ -205,11 +306,11 @@ typedef struct {
|
|
205
306
|
#define QI4_K (QK_K / (4*QR4_K))
|
206
307
|
#ifdef GGML_QKK_64
|
207
308
|
typedef struct {
|
208
|
-
half d[2]; // super-block scales/mins
|
309
|
+
half dm[2]; // super-block scales/mins
|
209
310
|
uint8_t scales[2]; // 4-bit block scales/mins
|
210
311
|
uint8_t qs[QK_K/2]; // 4--bit quants
|
211
312
|
} block_q4_K;
|
212
|
-
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
313
|
+
static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
213
314
|
#else
|
214
315
|
typedef struct {
|
215
316
|
half2 dm; // super-block scale for quantized scales/mins
|
@@ -259,6 +360,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
|
259
360
|
#define CUDA_CPY_BLOCK_SIZE 32
|
260
361
|
#define CUDA_SCALE_BLOCK_SIZE 256
|
261
362
|
#define CUDA_ROPE_BLOCK_SIZE 256
|
363
|
+
#define CUDA_ALIBI_BLOCK_SIZE 32
|
262
364
|
#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
|
263
365
|
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
264
366
|
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
@@ -286,7 +388,7 @@ static int g_device_count = -1;
|
|
286
388
|
static int g_main_device = 0;
|
287
389
|
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
|
288
390
|
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
289
|
-
static bool g_mul_mat_q = false;
|
391
|
+
static bool g_mul_mat_q = true;
|
290
392
|
|
291
393
|
static void * g_scratch_buffer = nullptr;
|
292
394
|
static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
|
@@ -423,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
|
|
423
525
|
static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
424
526
|
const block_q4_1 * x = (const block_q4_1 *) vx;
|
425
527
|
|
426
|
-
const dfloat d = x[ib].dm.x;
|
427
|
-
const dfloat m = x[ib].dm.y;
|
528
|
+
const dfloat d = __low2half(x[ib].dm);
|
529
|
+
const dfloat m = __high2half(x[ib].dm);
|
428
530
|
|
429
531
|
const int vui = x[ib].qs[iqs];
|
430
532
|
|
@@ -466,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
|
|
466
568
|
static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
467
569
|
const block_q5_1 * x = (const block_q5_1 *) vx;
|
468
570
|
|
469
|
-
const dfloat d = x[ib].dm.x;
|
470
|
-
const dfloat m = x[ib].dm.y;
|
571
|
+
const dfloat d = __low2half(x[ib].dm);
|
572
|
+
const dfloat m = __high2half(x[ib].dm);
|
471
573
|
|
472
574
|
uint32_t qh;
|
473
575
|
memcpy(&qh, x[ib].qh, sizeof(qh));
|
@@ -519,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
|
|
519
621
|
const uint8_t q = x[i].qs[32*n + l];
|
520
622
|
float * y = yy + i*QK_K + 128*n;
|
521
623
|
|
522
|
-
float dall = x[i].dm.x;
|
523
|
-
float dmin = x[i].dm.y;
|
624
|
+
float dall = __low2half(x[i].dm);
|
625
|
+
float dmin = __high2half(x[i].dm);
|
524
626
|
y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
525
627
|
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
526
628
|
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
@@ -530,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
|
|
530
632
|
const int il = tid%16; // 0...15
|
531
633
|
const uint8_t q = x[i].qs[il] >> (2*is);
|
532
634
|
float * y = yy + i*QK_K + 16*is + il;
|
533
|
-
float dall = x[i].dm.x;
|
534
|
-
float dmin = x[i].dm.y;
|
635
|
+
float dall = __low2half(x[i].dm);
|
636
|
+
float dmin = __high2half(x[i].dm);
|
535
637
|
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
536
638
|
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
537
639
|
#endif
|
@@ -617,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
|
|
617
719
|
|
618
720
|
float * y = yy + i*QK_K + 64*il + n*ir;
|
619
721
|
|
620
|
-
const float dall = x[i].dm.x;
|
621
|
-
const float dmin = x[i].dm.y;
|
722
|
+
const float dall = __low2half(x[i].dm);
|
723
|
+
const float dmin = __high2half(x[i].dm);
|
622
724
|
|
623
725
|
const uint8_t * q = x[i].qs + 32*il + n*ir;
|
624
726
|
|
@@ -635,8 +737,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
|
|
635
737
|
const int tid = threadIdx.x;
|
636
738
|
const uint8_t * q = x[i].qs;
|
637
739
|
float * y = yy + i*QK_K;
|
638
|
-
const float d = (float)x[i].d[0];
|
639
|
-
const float m = (float)x[i].d[1];
|
740
|
+
const float d = (float)x[i].dm[0];
|
741
|
+
const float m = (float)x[i].dm[1];
|
640
742
|
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
|
641
743
|
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
|
642
744
|
#endif
|
@@ -656,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
|
|
656
758
|
|
657
759
|
float * y = yy + i*QK_K + 64*il + 2*ir;
|
658
760
|
|
659
|
-
const float dall = x[i].dm.x;
|
660
|
-
const float dmin = x[i].dm.y;
|
761
|
+
const float dall = __low2half(x[i].dm);
|
762
|
+
const float dmin = __high2half(x[i].dm);
|
661
763
|
|
662
764
|
const uint8_t * ql = x[i].qs + 32*il + 2*ir;
|
663
765
|
const uint8_t * qh = x[i].qh + 2*ir;
|
@@ -769,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
|
769
871
|
const float * y = yy + i * QK_K + y_offset;
|
770
872
|
const uint8_t * q = x[i].qs + q_offset;
|
771
873
|
|
772
|
-
const float dall = x[i].dm.x;
|
773
|
-
const float dmin = x[i].dm.y;
|
874
|
+
const float dall = __low2half(x[i].dm);
|
875
|
+
const float dmin = __high2half(x[i].dm);
|
774
876
|
|
775
877
|
const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
|
776
878
|
aux[0] = a[0] & 0x0f0f0f0f;
|
@@ -990,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
|
|
990
1092
|
const float * y1 = yy + i*QK_K + y_offset;
|
991
1093
|
const float * y2 = y1 + 128;
|
992
1094
|
|
993
|
-
const float dall = x[i].dm.x;
|
994
|
-
const float dmin = x[i].dm.y;
|
1095
|
+
const float dall = __low2half(x[i].dm);
|
1096
|
+
const float dmin = __high2half(x[i].dm);
|
995
1097
|
|
996
1098
|
const uint16_t * a = (const uint16_t *)x[i].scales;
|
997
1099
|
aux[0] = a[im+0] & kmask1;
|
@@ -1053,8 +1155,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
|
|
1053
1155
|
const uint16_t * a = (const uint16_t *)x[i].scales;
|
1054
1156
|
aux16[0] = a[0] & 0x0f0f;
|
1055
1157
|
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
1056
|
-
const float d = (float)x[i].d[0];
|
1057
|
-
const float m = (float)x[i].d[1];
|
1158
|
+
const float d = (float)x[i].dm[0];
|
1159
|
+
const float m = (float)x[i].dm[1];
|
1058
1160
|
float sum = 0.f;
|
1059
1161
|
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
1060
1162
|
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
|
@@ -1123,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
|
|
1123
1225
|
const float * y1 = yy + i*QK_K + y_offset;
|
1124
1226
|
const float * y2 = y1 + 128;
|
1125
1227
|
|
1126
|
-
const float dall = x[i].dm.x;
|
1127
|
-
const float dmin = x[i].dm.y;
|
1228
|
+
const float dall = __low2half(x[i].dm);
|
1229
|
+
const float dmin = __high2half(x[i].dm);
|
1128
1230
|
|
1129
1231
|
const uint16_t * a = (const uint16_t *)x[i].scales;
|
1130
1232
|
aux[0] = a[im+0] & kmask1;
|
@@ -1347,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
|
|
1347
1449
|
return;
|
1348
1450
|
}
|
1349
1451
|
|
1350
|
-
y[ib].ds.x = d;
|
1351
|
-
y[ib].ds.y = sum;
|
1452
|
+
reinterpret_cast<half&>(y[ib].ds.x) = d;
|
1453
|
+
reinterpret_cast<half&>(y[ib].ds.y) = sum;
|
1352
1454
|
}
|
1353
1455
|
|
1354
1456
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
@@ -2345,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
|
|
2345
2447
|
u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
2346
2448
|
}
|
2347
2449
|
|
2348
|
-
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
|
2450
|
+
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
|
2349
2451
|
}
|
2350
2452
|
|
2351
2453
|
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
@@ -2431,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
|
|
2431
2533
|
#pragma unroll
|
2432
2534
|
for (int i = 0; i < QR2_K; ++ i) {
|
2433
2535
|
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
2434
|
-
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
2536
|
+
d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
|
2435
2537
|
}
|
2436
2538
|
|
2437
2539
|
return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
|
@@ -2550,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
|
2550
2652
|
#pragma unroll
|
2551
2653
|
for (int i = 0; i < QR3_K; ++i) {
|
2552
2654
|
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
2553
|
-
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
2655
|
+
d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
|
2554
2656
|
}
|
2555
2657
|
|
2556
2658
|
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
|
@@ -2719,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|
2719
2821
|
|
2720
2822
|
for (int i = 0; i < QR4_K; ++i) {
|
2721
2823
|
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
2722
|
-
d8[i] = bq8i->ds.x;
|
2824
|
+
d8[i] = __low2half(bq8i->ds);
|
2723
2825
|
|
2724
2826
|
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
2725
2827
|
u[2*i+0] = q8[0];
|
@@ -2743,11 +2845,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|
2743
2845
|
aux16[0] = a[0] & 0x0f0f;
|
2744
2846
|
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
2745
2847
|
|
2746
|
-
const float dall = bq4_K->d[0];
|
2747
|
-
const float dmin = bq4_K->d[1];
|
2848
|
+
const float dall = bq4_K->dm[0];
|
2849
|
+
const float dmin = bq4_K->dm[1];
|
2748
2850
|
|
2749
|
-
const float d8_1 = bq8_1[0].ds.x;
|
2750
|
-
const float d8_2 = bq8_1[1].ds.x;
|
2851
|
+
const float d8_1 = __low2float(bq8_1[0].ds);
|
2852
|
+
const float d8_2 = __low2float(bq8_1[1].ds);
|
2751
2853
|
|
2752
2854
|
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
2753
2855
|
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
@@ -2827,7 +2929,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
|
2827
2929
|
|
2828
2930
|
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
2829
2931
|
|
2932
|
+
#if QK_K == 256
|
2830
2933
|
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
2934
|
+
#else
|
2935
|
+
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
|
2936
|
+
#endif
|
2831
2937
|
}
|
2832
2938
|
|
2833
2939
|
#pragma unroll
|
@@ -2900,7 +3006,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|
2900
3006
|
#pragma unroll
|
2901
3007
|
for (int i = 0; i < QR5_K; ++i) {
|
2902
3008
|
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
2903
|
-
d8[i] = bq8i->ds.x;
|
3009
|
+
d8[i] = __low2float(bq8i->ds);
|
2904
3010
|
|
2905
3011
|
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
2906
3012
|
u[2*i+0] = q8[0];
|
@@ -2918,8 +3024,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|
2918
3024
|
|
2919
3025
|
const float d = bq5_K->d;
|
2920
3026
|
|
2921
|
-
const float d8_1 = bq8_1[0].ds.x;
|
2922
|
-
const float d8_2 = bq8_1[1].ds.x;
|
3027
|
+
const float d8_1 = __low2half(bq8_1[0].ds);
|
3028
|
+
const float d8_2 = __low2half(bq8_1[1].ds);
|
2923
3029
|
|
2924
3030
|
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
2925
3031
|
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
@@ -3017,7 +3123,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
|
3017
3123
|
|
3018
3124
|
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
3019
3125
|
|
3126
|
+
#if QK_K == 256
|
3020
3127
|
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
3128
|
+
#endif
|
3021
3129
|
}
|
3022
3130
|
|
3023
3131
|
#pragma unroll
|
@@ -3074,7 +3182,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
|
3074
3182
|
#pragma unroll
|
3075
3183
|
for (int i = 0; i < QR6_K; ++i) {
|
3076
3184
|
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
|
3077
|
-
d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
|
3185
|
+
d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
|
3078
3186
|
}
|
3079
3187
|
|
3080
3188
|
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
|
@@ -3242,7 +3350,7 @@ static __device__ __forceinline__ void mul_mat_q(
|
|
3242
3350
|
*dsi_dst = *dsi_src;
|
3243
3351
|
} else {
|
3244
3352
|
float * dfi_dst = (float *) dsi_dst;
|
3245
|
-
*dfi_dst = (*dsi_src).x;
|
3353
|
+
*dfi_dst = __low2half(*dsi_src);
|
3246
3354
|
}
|
3247
3355
|
}
|
3248
3356
|
|
@@ -3886,13 +3994,13 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
|
3886
3994
|
// rope == RoPE == rotary positional embedding
|
3887
3995
|
static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
|
3888
3996
|
const float p_delta, const int p_delta_rows, const float theta_scale) {
|
3889
|
-
const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
|
3997
|
+
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
3890
3998
|
|
3891
3999
|
if (col >= ncols) {
|
3892
4000
|
return;
|
3893
4001
|
}
|
3894
4002
|
|
3895
|
-
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
4003
|
+
const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
3896
4004
|
const int i = row*ncols + col;
|
3897
4005
|
|
3898
4006
|
const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
|
@@ -3906,6 +4014,28 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
|
|
3906
4014
|
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
3907
4015
|
}
|
3908
4016
|
|
4017
|
+
static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
|
4018
|
+
const float p_delta, const int p_delta_rows, const float theta_scale) {
|
4019
|
+
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
4020
|
+
|
4021
|
+
if (col >= ncols) {
|
4022
|
+
return;
|
4023
|
+
}
|
4024
|
+
|
4025
|
+
const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
4026
|
+
const int i = row*ncols + col/2;
|
4027
|
+
|
4028
|
+
const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
|
4029
|
+
const float sin_theta = sinf(theta);
|
4030
|
+
const float cos_theta = cosf(theta);
|
4031
|
+
|
4032
|
+
const float x0 = x[i + 0];
|
4033
|
+
const float x1 = x[i + ncols/2];
|
4034
|
+
|
4035
|
+
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
4036
|
+
dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
|
4037
|
+
}
|
4038
|
+
|
3909
4039
|
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
|
3910
4040
|
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
3911
4041
|
const int half_n_dims = ncols/4;
|
@@ -3940,9 +4070,32 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
|
|
3940
4070
|
dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
|
3941
4071
|
}
|
3942
4072
|
|
3943
|
-
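static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {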
|
4073
|
+
static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
|
4074
|
+
const int n_heads_log2_floor, const float m0, const float m1) {
|
3944
4075
|
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
4076
|
+
|
4077
|
+
if (col >= ncols) {
|
4078
|
+
return;
|
4079
|
+
}
|
4080
|
+
|
3945
4081
|
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
4082
|
+
const int i = row*ncols + col;
|
4083
|
+
|
4084
|
+
const int k = row/k_rows;
|
4085
|
+
|
4086
|
+
float m_k;
|
4087
|
+
if (k < n_heads_log2_floor) {
|
4088
|
+
m_k = powf(m0, k + 1);
|
4089
|
+
} else {
|
4090
|
+
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
|
4091
|
+
}
|
4092
|
+
|
4093
|
+
dst[i] = col * m_k + x[i];
|
4094
|
+
}
|
4095
|
+
|
4096
|
+
static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
|
4097
|
+
const int col = blockDim.y*blockIdx.y + threadIdx.y;
|
4098
|
+
const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
3946
4099
|
|
3947
4100
|
if (col >= ncols) {
|
3948
4101
|
return;
|
@@ -3955,24 +4108,29 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
|
|
3955
4108
|
|
3956
4109
|
// the CUDA soft max implementation differs from the CPU implementation
|
3957
4110
|
// instead of doubles floats are used
|
3958
|
-
// values are also not normalized to the maximum value by subtracting it in the exponential function
|
3959
|
-
// theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
|
3960
4111
|
static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
|
3961
|
-
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
3962
|
-
const int block_size = blockDim.x;
|
3963
|
-
const int tid = threadIdx.x;
|
4112
|
+
const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
4113
|
+
const int block_size = blockDim.y;
|
4114
|
+
const int tid = threadIdx.y;
|
3964
4115
|
|
3965
|
-
float
|
4116
|
+
float max_val = -INFINITY;
|
3966
4117
|
|
3967
|
-
for (int
|
3968
|
-
const int
|
4118
|
+
for (int col = tid; col < ncols; col += block_size) {
|
4119
|
+
const int i = row*ncols + col;
|
4120
|
+
max_val = max(max_val, x[i]);
|
4121
|
+
}
|
3969
4122
|
|
3970
|
-
|
3971
|
-
|
3972
|
-
|
4123
|
+
// find the max value in the block
|
4124
|
+
#pragma unroll
|
4125
|
+
for (int mask = 16; mask > 0; mask >>= 1) {
|
4126
|
+
max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
|
4127
|
+
}
|
4128
|
+
|
4129
|
+
float tmp = 0.f;
|
3973
4130
|
|
4131
|
+
for (int col = tid; col < ncols; col += block_size) {
|
3974
4132
|
const int i = row*ncols + col;
|
3975
|
-
const float val = expf(x[i]);
|
4133
|
+
const float val = expf(x[i] - max_val);
|
3976
4134
|
tmp += val;
|
3977
4135
|
dst[i] = val;
|
3978
4136
|
}
|
@@ -3983,15 +4141,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
|
|
3983
4141
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
3984
4142
|
}
|
3985
4143
|
|
3986
|
-
|
3987
|
-
const int col = block_start + tid;
|
3988
|
-
|
3989
|
-
if (col >= ncols) {
|
3990
|
-
break;
|
3991
|
-
}
|
4144
|
+
const float inv_tmp = 1.f / tmp;
|
3992
4145
|
|
4146
|
+
for (int col = tid; col < ncols; col += block_size) {
|
3993
4147
|
const int i = row*ncols + col;
|
3994
|
-
dst[i] /= tmp;
|
4148
|
+
dst[i] *= inv_tmp;
|
3995
4149
|
}
|
3996
4150
|
}
|
3997
4151
|
|
@@ -4561,6 +4715,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
|
|
4561
4715
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
4562
4716
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
4563
4717
|
|
4718
|
+
#if QK_K == 256
|
4719
|
+
|
4564
4720
|
int id;
|
4565
4721
|
CUDA_CHECK(cudaGetDevice(&id));
|
4566
4722
|
const int compute_capability = g_compute_capabilities[id];
|
@@ -4592,6 +4748,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
|
|
4592
4748
|
mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
|
4593
4749
|
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
4594
4750
|
}
|
4751
|
+
#endif
|
4595
4752
|
}
|
4596
4753
|
|
4597
4754
|
static void ggml_mul_mat_q4_K_q8_1_cuda(
|
@@ -4751,13 +4908,22 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
|
|
4751
4908
|
|
4752
4909
|
static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
|
4753
4910
|
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
|
4754
|
-
GGML_ASSERT(
|
4755
|
-
const dim3 block_dims(
|
4911
|
+
GGML_ASSERT(ncols % 2 == 0);
|
4912
|
+
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
4756
4913
|
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
4757
|
-
const dim3 block_nums(
|
4914
|
+
const dim3 block_nums(nrows, num_blocks_x, 1);
|
4758
4915
|
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
|
4759
4916
|
}
|
4760
4917
|
|
4918
|
+
static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
|
4919
|
+
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
|
4920
|
+
GGML_ASSERT(ncols % 2 == 0);
|
4921
|
+
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
4922
|
+
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
4923
|
+
const dim3 block_nums(nrows, num_blocks_x, 1);
|
4924
|
+
rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
|
4925
|
+
}
|
4926
|
+
|
4761
4927
|
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
|
4762
4928
|
GGML_ASSERT(nrows % 4 == 0);
|
4763
4929
|
const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
|
@@ -4766,16 +4932,25 @@ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, con
|
|
4766
4932
|
rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
|
4767
4933
|
}
|
4768
4934
|
|
4935
|
+
static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
|
4936
|
+
const int k_rows, const int n_heads_log2_floor, const float m0,
|
4937
|
+
const float m1, cudaStream_t stream) {
|
4938
|
+
const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
|
4939
|
+
const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
|
4940
|
+
const dim3 block_nums(num_blocks_x, nrows, 1);
|
4941
|
+
alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
|
4942
|
+
}
|
4943
|
+
|
4769
4944
|
static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
|
4770
|
-
const dim3 block_dims(
|
4945
|
+
const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
|
4771
4946
|
const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
|
4772
|
-
const dim3 block_nums(
|
4947
|
+
const dim3 block_nums(nrows_x, block_num_x, 1);
|
4773
4948
|
diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
|
4774
4949
|
}
|
4775
4950
|
|
4776
4951
|
static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
|
4777
|
-
const dim3 block_dims(
|
4778
|
-
const dim3 block_nums(
|
4952
|
+
const dim3 block_dims(1, WARP_SIZE, 1);
|
4953
|
+
const dim3 block_nums(nrows_x, 1, 1);
|
4779
4954
|
soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
|
4780
4955
|
}
|
4781
4956
|
|
@@ -4880,10 +5055,18 @@ void ggml_init_cublas() {
|
|
4880
5055
|
static bool initialized = false;
|
4881
5056
|
|
4882
5057
|
if (!initialized) {
|
5058
|
+
|
5059
|
+
#ifdef __HIP_PLATFORM_AMD__
|
5060
|
+
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
5061
|
+
// https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
|
5062
|
+
rocblas_initialize();
|
5063
|
+
CUDA_CHECK(cudaDeviceSynchronize());
|
5064
|
+
#endif
|
5065
|
+
|
4883
5066
|
CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
|
4884
5067
|
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
|
4885
5068
|
int64_t total_vram = 0;
|
4886
|
-
fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
|
5069
|
+
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
|
4887
5070
|
for (int id = 0; id < g_device_count; ++id) {
|
4888
5071
|
cudaDeviceProp prop;
|
4889
5072
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
@@ -5481,7 +5664,8 @@ inline void ggml_cuda_op_rope(
|
|
5481
5664
|
|
5482
5665
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
5483
5666
|
|
5484
|
-
const bool is_glm = mode & 4;
|
5667
|
+
const bool is_neox = mode & 2;
|
5668
|
+
const bool is_glm = mode & 4;
|
5485
5669
|
|
5486
5670
|
// compute
|
5487
5671
|
if (is_glm) {
|
@@ -5489,6 +5673,10 @@ inline void ggml_cuda_op_rope(
|
|
5489
5673
|
const float id_p = min(p, n_ctx - 2.f);
|
5490
5674
|
const float block_p = max(p - (n_ctx - 2.f), 0.f);
|
5491
5675
|
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
|
5676
|
+
} else if (is_neox) {
|
5677
|
+
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
|
5678
|
+
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
5679
|
+
rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
5492
5680
|
} else {
|
5493
5681
|
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
5494
5682
|
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
@@ -5501,6 +5689,41 @@ inline void ggml_cuda_op_rope(
|
|
5501
5689
|
(void) i1;
|
5502
5690
|
}
|
5503
5691
|
|
5692
|
+
inline void ggml_cuda_op_alibi(
|
5693
|
+
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
|
5694
|
+
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
|
5695
|
+
cudaStream_t & cudaStream_main){
|
5696
|
+
|
5697
|
+
GGML_ASSERT(src0_ddf_i != nullptr);
|
5698
|
+
GGML_ASSERT(dst_ddf_i != nullptr);
|
5699
|
+
|
5700
|
+
const int64_t ne00 = src0->ne[0];
|
5701
|
+
const int64_t ne01 = src0->ne[1];
|
5702
|
+
const int64_t ne02 = src0->ne[2];
|
5703
|
+
const int64_t i01_diff = i01_high - i01_low;
|
5704
|
+
|
5705
|
+
const int n_past = ((int32_t *) dst->op_params)[0];
|
5706
|
+
const int n_head = ((int32_t *) dst->op_params)[1];
|
5707
|
+
float max_bias;
|
5708
|
+
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
|
5709
|
+
|
5710
|
+
GGML_ASSERT(ne01 + n_past == ne00);
|
5711
|
+
GGML_ASSERT(n_head == ne02);
|
5712
|
+
|
5713
|
+
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
5714
|
+
|
5715
|
+
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
5716
|
+
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
5717
|
+
|
5718
|
+
// compute
|
5719
|
+
alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
|
5720
|
+
|
5721
|
+
(void) src1;
|
5722
|
+
(void) src0_ddq_i;
|
5723
|
+
(void) src1_ddf_i;
|
5724
|
+
(void) i1;
|
5725
|
+
}
|
5726
|
+
|
5504
5727
|
inline void ggml_cuda_op_diag_mask_inf(
|
5505
5728
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
|
5506
5729
|
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
|
@@ -6115,12 +6338,19 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
|
|
6115
6338
|
|
6116
6339
|
void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6117
6340
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
6341
|
+
GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
|
6118
6342
|
|
6119
6343
|
const int mode = ((int32_t *) dst->op_params)[2];
|
6120
6344
|
const bool is_glm = mode & 4;
|
6345
|
+
|
6121
6346
|
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
|
6122
6347
|
}
|
6123
6348
|
|
6349
|
+
void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6350
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
6351
|
+
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
|
6352
|
+
}
|
6353
|
+
|
6124
6354
|
void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6125
6355
|
(void) src0;
|
6126
6356
|
(void) src1;
|
@@ -6240,7 +6470,7 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
|
6240
6470
|
return extra;
|
6241
6471
|
}
|
6242
6472
|
|
6243
|
-
void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
|
6473
|
+
void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
|
6244
6474
|
if (scratch && g_scratch_size == 0) {
|
6245
6475
|
return;
|
6246
6476
|
}
|
@@ -6249,14 +6479,19 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
|
6249
6479
|
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
|
6250
6480
|
const ggml_op src0_op = tensor->src[0]->op;
|
6251
6481
|
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
|
6252
|
-
ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
|
6482
|
+
ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
|
6253
6483
|
}
|
6254
6484
|
}
|
6255
6485
|
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
|
6256
|
-
ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
|
6486
|
+
ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
|
6257
6487
|
}
|
6258
6488
|
|
6259
6489
|
tensor->backend = GGML_BACKEND_GPU;
|
6490
|
+
|
6491
|
+
if (scratch && no_alloc) {
|
6492
|
+
return;
|
6493
|
+
}
|
6494
|
+
|
6260
6495
|
struct ggml_tensor_extra_gpu * extra;
|
6261
6496
|
|
6262
6497
|
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
@@ -6308,16 +6543,48 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
|
6308
6543
|
tensor->extra = extra;
|
6309
6544
|
}
|
6310
6545
|
|
6546
|
+
void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
|
6547
|
+
if (g_scratch_size == 0) {
|
6548
|
+
return;
|
6549
|
+
}
|
6550
|
+
if (g_scratch_buffer == nullptr) {
|
6551
|
+
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
|
6552
|
+
}
|
6553
|
+
|
6554
|
+
struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
|
6555
|
+
|
6556
|
+
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
6557
|
+
tensor->op == GGML_OP_VIEW;
|
6558
|
+
|
6559
|
+
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
6560
|
+
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
6561
|
+
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
6562
|
+
size_t view_offset = 0;
|
6563
|
+
if (tensor->op == GGML_OP_VIEW) {
|
6564
|
+
memcpy(&view_offset, tensor->op_params, sizeof(size_t));
|
6565
|
+
}
|
6566
|
+
extra->data_device[g_main_device] = src0_ddc + view_offset;
|
6567
|
+
} else {
|
6568
|
+
extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
|
6569
|
+
}
|
6570
|
+
|
6571
|
+
tensor->extra = extra;
|
6572
|
+
}
|
6573
|
+
|
6311
6574
|
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
|
6312
|
-
ggml_cuda_assign_buffers_impl(tensor, true, false);
|
6575
|
+
ggml_cuda_assign_buffers_impl(tensor, true, false, false);
|
6576
|
+
}
|
6577
|
+
|
6578
|
+
void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
|
6579
|
+
ggml_cuda_assign_buffers_impl(tensor, true, false, true);
|
6313
6580
|
}
|
6314
6581
|
|
6315
6582
|
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
|
6316
|
-
ggml_cuda_assign_buffers_impl(tensor, false, false);
|
6583
|
+
ggml_cuda_assign_buffers_impl(tensor, false, false, false);
|
6317
6584
|
}
|
6318
6585
|
|
6319
6586
|
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
|
6320
|
-
ggml_cuda_assign_buffers_impl(tensor, false, true);
|
6587
|
+
ggml_cuda_assign_buffers_impl(tensor, false, true, false);
|
6321
6588
|
}
|
6322
6589
|
|
6323
6590
|
void ggml_cuda_set_main_device(int main_device) {
|
@@ -6456,6 +6723,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
6456
6723
|
}
|
6457
6724
|
func = ggml_cuda_rope;
|
6458
6725
|
break;
|
6726
|
+
case GGML_OP_ALIBI:
|
6727
|
+
if (!any_on_device) {
|
6728
|
+
return false;
|
6729
|
+
}
|
6730
|
+
func = ggml_cuda_alibi;
|
6731
|
+
break;
|
6459
6732
|
default:
|
6460
6733
|
return false;
|
6461
6734
|
}
|