llama_cpp 0.3.8 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/README.md +1 -1
- data/examples/chat.rb +4 -6
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +129 -124
- data/ext/llama_cpp/src/ggml-alloc.c +90 -113
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +350 -77
- data/ext/llama_cpp/src/ggml-cuda.h +13 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +226 -121
- data/ext/llama_cpp/src/ggml-metal.metal +157 -35
- data/ext/llama_cpp/src/ggml.c +2724 -584
- data/ext/llama_cpp/src/ggml.h +282 -31
- data/ext/llama_cpp/src/k_quants.c +112 -56
- data/ext/llama_cpp/src/llama.cpp +4857 -2986
- data/ext/llama_cpp/src/llama.h +180 -126
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -2
- data/sig/llama_cpp.rbs +12 -11
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -6,15 +6,116 @@
 #include <atomic>
 #include <assert.h>

+#if defined(GGML_USE_HIPBLAS)
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F HIPBLAS_R_16F
+#define CUDA_R_32F HIPBLAS_R_32F
+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasCreate hipblasCreate
+#define cublasGemmEx hipblasGemmEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventRecord hipEventRecord
+#define cudaEvent_t hipEvent_t
+#define cudaEventDestroy hipEventDestroy
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaMalloc hipMalloc
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyKind hipMemcpyKind
+#define cudaMemset hipMemset
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+#else
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
+#endif

 #include "ggml-cuda.h"
 #include "ggml.h"

 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#ifndef CC_TURING
 #define CC_TURING 700
+#endif
+
+#if defined(GGML_USE_HIPBLAS)
+#define __CUDA_ARCH__ 1300
+
+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+    return reinterpret_cast<const int&>(c);
+}
+
+static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(__gfx1100__)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+}
+#endif

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
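The hunk above maps the cuBLAS/CUDA runtime symbols onto their hipBLAS/HIP equivalents so the same source compiles for ROCm, and emulates `__dp4a` on GPUs without a hardware dot-product instruction. For reference, the final fallback branch of that emulation is just a signed byte-wise dot product; a minimal host-side C++ sketch of the same arithmetic (illustrative only, not part of the package):

```cpp
#include <cstdint>
#include <cstdio>

// Reference for the fallback branch of the __dp4a emulation above: treat each
// 32-bit int as four signed bytes, multiply pairwise and accumulate into c.
static int dp4a_ref(int a, int b, int c) {
    for (int k = 0; k < 4; ++k) {
        const int8_t va = static_cast<int8_t>((a >> (8 * k)) & 0xff);
        const int8_t vb = static_cast<int8_t>((b >> (8 * k)) & 0xff);
        c += va * vb;
    }
    return c;
}

int main() {
    // 0x01020304 = bytes {4, 3, 2, 1}; 0x01010101 = bytes {1, 1, 1, 1}
    printf("%d\n", dp4a_ref(0x01020304, 0x01010101, 0)); // prints 10
    return 0;
}
```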
@@ -205,11 +306,11 @@ typedef struct {
 #define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
-    half
+    half    dm[2];             // super-block scales/mins
     uint8_t scales[2];         // 4-bit block scales/mins
     uint8_t qs[QK_K/2];        // 4--bit quants
 } block_q4_K;
-static_assert(sizeof(block_q4_K) ==
+static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
 #else
 typedef struct {
     half2 dm;                  // super-block scale for quantized scales/mins
@@ -259,6 +360,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
@@ -286,7 +388,7 @@ static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q =
+static bool g_mul_mat_q = true;

 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
@@ -423,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q4_1 * x = (const block_q4_1 *) vx;

-    const dfloat d = x[ib].dm
-    const dfloat m = x[ib].dm
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);

     const int vui = x[ib].qs[iqs];

@@ -466,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q5_1 * x = (const block_q5_1 *) vx;

-    const dfloat d = x[ib].dm
-    const dfloat m = x[ib].dm
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);

     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -519,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const uint8_t q = x[i].qs[32*n + l];
     float * y = yy + i*QK_K + 128*n;

-    float dall = x[i].dm
-    float dmin = x[i].dm
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
     y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -530,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const int il = tid%16;  // 0...15
     const uint8_t q = x[i].qs[il] >> (2*is);
     float * y = yy + i*QK_K + 16*is + il;
-    float dall = x[i].dm
-    float dmin = x[i].dm
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
     y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
 #endif
@@ -617,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float

     float * y = yy + i*QK_K + 64*il + n*ir;

-    const float dall = x[i].dm
-    const float dmin = x[i].dm
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);

     const uint8_t * q = x[i].qs + 32*il + n*ir;

@@ -635,8 +737,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
     const int tid = threadIdx.x;
     const uint8_t * q = x[i].qs;
     float * y = yy + i*QK_K;
-    const float d = (float)x[i].
-    const float m = (float)x[i].
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
     y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
     y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
 #endif
@@ -656,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float

     float * y = yy + i*QK_K + 64*il + 2*ir;

-    const float dall = x[i].dm
-    const float dmin = x[i].dm
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);

     const uint8_t * ql = x[i].qs + 32*il + 2*ir;
     const uint8_t * qh = x[i].qh + 2*ir;
@@ -769,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
     const float   * y = yy + i * QK_K + y_offset;
     const uint8_t * q = x[i].qs + q_offset;

-    const float dall = x[i].dm
-    const float dmin = x[i].dm
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);

     const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
     aux[0] = a[0] & 0x0f0f0f0f;
@@ -990,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     const float * y1 = yy + i*QK_K + y_offset;
     const float * y2 = y1 + 128;

-    const float dall = x[i].dm
-    const float dmin = x[i].dm
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);

     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux[0] = a[im+0] & kmask1;
@@ -1053,8 +1155,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux16[0] = a[0] & 0x0f0f;
     aux16[1] = (a[0] >> 4) & 0x0f0f;
-    const float d = (float)x[i].
-    const float m = (float)x[i].
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
     float sum = 0.f;
     for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
         sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
@@ -1123,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
     const float * y1 = yy + i*QK_K + y_offset;
     const float * y2 = y1 + 128;

-    const float dall = x[i].dm
-    const float dmin = x[i].dm
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);

     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux[0] = a[im+0] & kmask1;
@@ -1347,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
         return;
     }

-    y[ib].ds.x = d;
-    y[ib].ds.y = sum;
+    reinterpret_cast<half&>(y[ib].ds.x) = d;
+    reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
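The hunks above replace direct member access on the packed `half2` scale/min fields (`dm`, `ds`) with `__low2half`/`__high2half` reads and `reinterpret_cast<half&>` writes, which also build under HIP. The scale/min pair itself is used as in q4_1-style dequantization, `y = d*q + m`; a simplified CPU sketch of that pattern (the nibble packing here is illustrative and does not reproduce the exact ggml block layout):

```cpp
#include <cstdint>
#include <cstdio>

// Host-side reference of the "scale + min" dequantization the kernels above
// perform: each 4-bit quant q becomes d*q + m, where d and m are the two fp16
// values packed into the block's dm field (plain float here).
static void dequant_scale_min(const uint8_t * qs, int n, float d, float m, float * out) {
    for (int i = 0; i < n; ++i) {
        const uint8_t byte = qs[i / 2];
        const int q = (i % 2 == 0) ? (byte & 0x0F) : (byte >> 4);
        out[i] = d * static_cast<float>(q) + m;
    }
}

int main() {
    const uint8_t qs[2] = { 0x21, 0x43 }; // quants 1, 2, 3, 4 (low nibble first)
    float y[4];
    dequant_scale_min(qs, 4, 0.5f, -1.0f, y);
    for (float v : y) printf("%.2f ", v);  // -0.50 0.00 0.50 1.00
    printf("\n");
    return 0;
}
```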
@@ -2345,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
         u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
     }

-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
 }

 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2431,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR2_K; ++ i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
     }

     return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -2550,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR3_K; ++i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
     }

     return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -2719,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(

     for (int i = 0; i < QR4_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds
+        d8[i] = __low2half(bq8i->ds);

         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
@@ -2743,11 +2845,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     aux16[0] = a[0] & 0x0f0f;
     aux16[1] = (a[0] >> 4) & 0x0f0f;

-    const float dall = bq4_K->
-    const float dmin = bq4_K->
+    const float dall = bq4_K->dm[0];
+    const float dmin = bq4_K->dm[1];

-    const float d8_1 = bq8_1[0].ds
-    const float d8_2 = bq8_1[1].ds
+    const float d8_1 = __low2float(bq8_1[0].ds);
+    const float d8_2 = __low2float(bq8_1[1].ds);

     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2827,7 +2929,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;

+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+#else
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
+#endif
     }

 #pragma unroll
@@ -2900,7 +3006,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR5_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds
+        d8[i] = __low2float(bq8i->ds);

         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
@@ -2918,8 +3024,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(

     const float d = bq5_K->d;

-    const float d8_1 = bq8_1[0].ds
-    const float d8_2 = bq8_1[1].ds
+    const float d8_1 = __low2half(bq8_1[0].ds);
+    const float d8_2 = __low2half(bq8_1[1].ds);

     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -3017,7 +3123,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;

+#if QK_K == 256
         x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+#endif
     }

 #pragma unroll
@@ -3074,7 +3182,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR6_K; ++i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + 2*i].ds
+        d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
     }

     return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -3242,7 +3350,7 @@ static __device__ __forceinline__ void mul_mat_q(
             *dsi_dst = *dsi_src;
         } else {
             float * dfi_dst = (float *) dsi_dst;
-            *dfi_dst = (*dsi_src)
+            *dfi_dst = __low2half(*dsi_src);
         }
     }

@@ -3886,13 +3994,13 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
 // rope == RoPE == rotary positional embedding
 static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
                                 const float p_delta, const int p_delta_rows, const float theta_scale) {
-    const int col = 2*(blockDim.
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

     if (col >= ncols) {
         return;
     }

-    const int row = blockDim.
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
     const int i = row*ncols + col;

     const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
@@ -3906,6 +4014,28 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }

+static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
+                                     const float p_delta, const int p_delta_rows, const float theta_scale) {
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i = row*ncols + col/2;
+
+    const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + ncols/2];
+
+    dst[i + 0]       = x0*cos_theta - x1*sin_theta;
+    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+}
+
 static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;
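The new `rope_neox_f32` kernel applies NeoX-style rotary embeddings: instead of rotating adjacent element pairs as `rope_f32` does, it pairs element `j` with element `j + ncols/2`. A plain C++ reference of that rotation for one row (simplified parameterisation, assuming `theta = p * theta_scale^j`):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// CPU sketch of the NeoX-style rotation in rope_neox_f32: element j is paired
// with element j + ncols/2 and both are rotated by the same angle theta.
static void rope_neox_row(std::vector<float> & row, float p, float theta_scale) {
    const int ncols = static_cast<int>(row.size());
    for (int j = 0; j < ncols / 2; ++j) {
        const float theta = p * std::pow(theta_scale, static_cast<float>(j));
        const float c = std::cos(theta), s = std::sin(theta);
        const float x0 = row[j];
        const float x1 = row[j + ncols / 2];
        row[j]             = x0 * c - x1 * s;
        row[j + ncols / 2] = x0 * s + x1 * c;
    }
}

int main() {
    std::vector<float> row = {1.0f, 0.0f, 0.0f, 1.0f};
    rope_neox_row(row, /*p=*/1.5707963f, /*theta_scale=*/1.0f); // rotate by ~pi/2
    for (float v : row) printf("% .3f ", v); // ~ 0.000 -1.000  1.000  0.000
    printf("\n");
    return 0;
}
```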
@@ -3940,9 +4070,32 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
 }

-static __global__ void
+static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
+                                 const int n_heads_log2_floor, const float m0, const float m1) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (col >= ncols) {
+        return;
+    }
+
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i = row*ncols + col;
+
+    const int k = row/k_rows;
+
+    float m_k;
+    if (k < n_heads_log2_floor) {
+        m_k = powf(m0, k + 1);
+    } else {
+        m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+    }
+
+    dst[i] = col * m_k + x[i];
+}
+
+static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+    const int col = blockDim.y*blockIdx.y + threadIdx.y;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;

     if (col >= ncols) {
         return;
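The new `alibi_f32` kernel adds the ALiBi linear bias `col * m_k` to each score, where the per-head slope `m_k` is derived from `m0`/`m1` exactly as shown above. A CPU sketch of the slope computation (the `max_bias = 8.0f` and `n_head = 6` values are only examples):

```cpp
#include <cmath>
#include <cstdio>

// CPU sketch of the per-head ALiBi slope computation used by alibi_f32:
// heads below the nearest power of two use powers of m0, the remaining heads
// use odd powers of m1.
int main() {
    const int   n_head   = 6;
    const float max_bias = 8.0f;

    const int n_heads_log2_floor = 1 << (int) std::floor(std::log2((float) n_head));
    const float m0 = std::pow(2.0f, -max_bias / n_heads_log2_floor);
    const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

    for (int k = 0; k < n_head; ++k) {
        const float m_k = k < n_heads_log2_floor
            ? std::pow(m0, (float) (k + 1))
            : std::pow(m1, (float) (2 * (k - n_heads_log2_floor) + 1));
        // the kernel then adds col * m_k to every score in head k
        printf("head %d: slope %.6f\n", k, m_k);
    }
    return 0;
}
```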
@@ -3955,24 +4108,29 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int

 // the CUDA soft max implementation differs from the CPU implementation
 // instead of doubles floats are used
-// values are also not normalized to the maximum value by subtracting it in the exponential function
-// theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
 static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockDim.
-    const int block_size = blockDim.
-    const int tid = threadIdx.
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int block_size = blockDim.y;
+    const int tid = threadIdx.y;

-    float
+    float max_val = -INFINITY;

-    for (int
-    const int
+    for (int col = tid; col < ncols; col += block_size) {
+        const int i = row*ncols + col;
+        max_val = max(max_val, x[i]);
+    }

-
-
-
+    // find the max value in the block
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+    }
+
+    float tmp = 0.f;

+    for (int col = tid; col < ncols; col += block_size) {
         const int i = row*ncols + col;
-        const float val = expf(x[i]);
+        const float val = expf(x[i] - max_val);
         tmp += val;
         dst[i] = val;
     }
@@ -3983,15 +4141,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
     }

-
-        const int col = block_start + tid;
-
-        if (col >= ncols) {
-            break;
-        }
+    const float inv_tmp = 1.f / tmp;

+    for (int col = tid; col < ncols; col += block_size) {
         const int i = row*ncols + col;
-        dst[i]
+        dst[i] *= inv_tmp;
     }
 }

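The rewritten `soft_max_f32` is now numerically stable: it first reduces the row maximum, exponentiates `x[i] - max_val`, and finally scales by `1/sum`, instead of exponentiating the raw inputs. A CPU reference of the same three passes (plain C++, no warp reductions):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// CPU reference of the numerically stable softmax the updated kernel computes:
// pass 1 finds the row max, pass 2 exponentiates x[i] - max and accumulates the
// sum, pass 3 scales by 1/sum (the kernel does the same with warp reductions).
static void softmax_row(std::vector<float> & v) {
    const float max_val = *std::max_element(v.begin(), v.end());
    float sum = 0.0f;
    for (float & x : v) {
        x = std::exp(x - max_val); // subtracting max_val avoids overflow in exp
        sum += x;
    }
    const float inv_sum = 1.0f / sum;
    for (float & x : v) {
        x *= inv_sum;
    }
}

int main() {
    std::vector<float> row = {1000.0f, 1001.0f, 1002.0f}; // would overflow exp without the max shift
    softmax_row(row);
    for (float x : row) printf("%.4f ", x); // ~0.0900 0.2447 0.6652
    printf("\n");
    return 0;
}
```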
@@ -4561,6 +4715,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
     const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

+#if QK_K == 256
+
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
     const int compute_capability = g_compute_capabilities[id];
@@ -4592,6 +4748,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
         mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
             (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
     }
+#endif
 }

 static void ggml_mul_mat_q4_K_q8_1_cuda(
@@ -4751,13 +4908,22 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons

 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                           const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(
-    const dim3 block_dims(
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-    const dim3 block_nums(
+    const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }

+static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+                               const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nrows, num_blocks_x, 1);
+    rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
+}
+
 static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(nrows % 4 == 0);
     const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@@ -4766,16 +4932,25 @@ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, con
     rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
 }

+static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
+                           const int k_rows, const int n_heads_log2_floor, const float m0,
+                           const float m1, cudaStream_t stream) {
+    const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
+    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
+    const dim3 block_nums(num_blocks_x, nrows, 1);
+    alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
-    const dim3 block_dims(
+    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
-    const dim3 block_nums(
+    const dim3 block_nums(nrows_x, block_num_x, 1);
     diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 }

 static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
-    const dim3 block_dims(
-    const dim3 block_nums(
+    const dim3 block_dims(1, WARP_SIZE, 1);
+    const dim3 block_nums(nrows_x, 1, 1);
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
 }

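The new launch wrappers all size their grids the same way: one block per row in one grid dimension and a ceil-division of the column count by the block's column coverage in the other. A host-side check of the RoPE grid math (block size taken from the diff; `ncols`/`nrows` are arbitrary example values):

```cpp
#include <cstdio>

// Host-side check of the grid math used by rope_f32_cuda / rope_neox_f32_cuda:
// each block handles CUDA_ROPE_BLOCK_SIZE column *pairs*, so the y dimension is
// ceil(ncols / (2*block_size)) and the x dimension is one block per row.
int main() {
    const int CUDA_ROPE_BLOCK_SIZE = 256; // value from the diff
    const int ncols = 4096;
    const int nrows = 32;

    const int num_blocks_x  = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    const int covered_cols  = num_blocks_x * 2 * CUDA_ROPE_BLOCK_SIZE;

    printf("grid = (%d, %d), covers %d columns (ncols = %d)\n",
           nrows, num_blocks_x, covered_cols, ncols);
    return 0;
}
```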
@@ -4880,10 +5055,18 @@ void ggml_init_cublas() {
     static bool initialized = false;

     if (!initialized) {
+
+#ifdef __HIP_PLATFORM_AMD__
+        // Workaround for a rocBLAS bug when using multiple graphics cards:
+        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+        rocblas_initialize();
+        CUDA_CHECK(cudaDeviceSynchronize());
+#endif
+
         CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
-        fprintf(stderr, "%s: found %d
+        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
         for (int id = 0; id < g_device_count; ++id) {
             cudaDeviceProp prop;
             CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
@@ -5481,7 +5664,8 @@ inline void ggml_cuda_op_rope(

     const float theta_scale = powf(freq_base, -2.0f/n_dims);

-    const bool
+    const bool is_neox = mode & 2;
+    const bool is_glm = mode & 4;

     // compute
     if (is_glm) {
@@ -5489,6 +5673,10 @@ inline void ggml_cuda_op_rope(
         const float id_p = min(p, n_ctx - 2.f);
         const float block_p = max(p - (n_ctx - 2.f), 0.f);
         rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+    } else if (is_neox) {
+        GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
+        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+        rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
     } else {
         const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
         rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
@@ -5501,6 +5689,41 @@ inline void ggml_cuda_op_rope(
     (void) i1;
 }

+inline void ggml_cuda_op_alibi(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_head = ((int32_t *) dst->op_params)[1];
+    float max_bias;
+    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+
+    GGML_ASSERT(ne01 + n_past == ne00);
+    GGML_ASSERT(n_head == ne02);
+
+    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+
+    // compute
+    alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
+
+    (void) src1;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
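`ggml_cuda_op_alibi` pulls `n_past`, `n_head` and `max_bias` out of the tensor's `op_params`, where the float `max_bias` is stored bit-for-bit in an `int32_t` slot and read back with `memcpy`. A small C++ sketch of that round trip (the parameter values are placeholders):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Sketch of how ggml_cuda_op_alibi recovers max_bias from op_params: the float
// is stored bit-for-bit in an int32_t slot, so memcpy (not a cast) reads it
// back without violating aliasing rules.
int main() {
    int32_t op_params[3] = {0, 0, 0};
    op_params[0] = 5;   // n_past (placeholder)
    op_params[1] = 32;  // n_head (placeholder)
    const float max_bias_in = 8.0f;
    std::memcpy(&op_params[2], &max_bias_in, sizeof(float));

    float max_bias_out;
    std::memcpy(&max_bias_out, &op_params[2], sizeof(float));
    printf("n_past=%d n_head=%d max_bias=%.1f\n",
           (int) op_params[0], (int) op_params[1], max_bias_out);
    return 0;
}
```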
@@ -6115,12 +6338,19 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml

 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented

     const int mode = ((int32_t *) dst->op_params)[2];
     const bool is_glm = mode & 4;
+
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }

+void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+}
+
 void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
@@ -6240,7 +6470,7 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     return extra;
 }

-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
@@ -6249,14 +6479,19 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
-            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
+            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
         }
     }
     if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
+        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
     }

     tensor->backend = GGML_BACKEND_GPU;
+
+    if (scratch && no_alloc) {
+        return;
+    }
+
     struct ggml_tensor_extra_gpu * extra;

     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
@@ -6308,16 +6543,48 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     tensor->extra = extra;
 }

+void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
+    if (g_scratch_size == 0) {
+        return;
+    }
+    if (g_scratch_buffer == nullptr) {
+        CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
+    }
+
+    struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+
+    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
+        tensor->op == GGML_OP_VIEW;
+
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+        size_t view_offset = 0;
+        if (tensor->op == GGML_OP_VIEW) {
+            memcpy(&view_offset, tensor->op_params, sizeof(size_t));
+        }
+        extra->data_device[g_main_device] = src0_ddc + view_offset;
+    } else {
+        extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
+    }
+
+    tensor->extra = extra;
+}
+
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true, false);
+    ggml_cuda_assign_buffers_impl(tensor, true, false, false);
+}
+
+void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, true, false, true);
 }

 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, false);
+    ggml_cuda_assign_buffers_impl(tensor, false, false, false);
 }

 void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, true);
+    ggml_cuda_assign_buffers_impl(tensor, false, true, false);
 }

 void ggml_cuda_set_main_device(int main_device) {
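The new `no_alloc` flag and `ggml_cuda_assign_scratch_offset` split buffer assignment into two phases: tensors can first be tagged for the GPU backend without allocating device memory, and later bound to precomputed offsets inside the shared scratch buffer. A hypothetical, self-contained C++ illustration of that two-phase pattern (`FakeTensor` and the offsets are invented for the example; this is not llama.cpp code):

```cpp
#include <cstdio>
#include <vector>

// Hypothetical illustration (not llama.cpp code) of the pattern behind
// ggml_cuda_assign_buffers_no_alloc + ggml_cuda_assign_scratch_offset:
// first tag tensors without allocating, then bind each one to a precomputed
// offset inside one shared scratch buffer.
struct FakeTensor {
    const char * name;
    size_t       nbytes;
    char *       data = nullptr; // bound in phase 2
};

int main() {
    std::vector<char> scratch(1024);           // stands in for g_scratch_buffer
    FakeTensor a{"a", 256}, b{"b", 512};

    // phase 1: "no_alloc" pass, only sizes are known
    size_t offset = 0;
    const size_t off_a = offset; offset += a.nbytes;
    const size_t off_b = offset; offset += b.nbytes;

    // phase 2: bind each tensor to its offset in the shared buffer
    a.data = scratch.data() + off_a;
    b.data = scratch.data() + off_b;

    printf("%s at offset %zu, %s at offset %zu\n", a.name, off_a, b.name, off_b);
    return 0;
}
```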
@@ -6456,6 +6723,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_rope;
             break;
+        case GGML_OP_ALIBI:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_alibi;
+            break;
         default:
             return false;
     }