llama_cpp 0.3.8 → 0.4.0
This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +110 -117
- data/ext/llama_cpp/src/ggml-alloc.c +79 -65
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +330 -69
- data/ext/llama_cpp/src/ggml-cuda.h +13 -0
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +102 -66
- data/ext/llama_cpp/src/ggml-metal.metal +113 -9
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama.cpp +4520 -2978
- data/ext/llama_cpp/src/llama.h +133 -125
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +7 -8
- metadata +2 -2
data/ext/llama_cpp/src/ggml-cuda.cu
@@ -6,15 +6,116 @@
 #include <atomic>
 #include <assert.h>
 
+#if defined(GGML_USE_HIPBLAS)
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F HIPBLAS_R_16F
+#define CUDA_R_32F HIPBLAS_R_32F
+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasCreate hipblasCreate
+#define cublasGemmEx hipblasGemmEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventRecord hipEventRecord
+#define cudaEvent_t hipEvent_t
+#define cudaEventDestroy hipEventDestroy
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaMalloc hipMalloc
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyKind hipMemcpyKind
+#define cudaMemset hipMemset
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+#else
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
+#endif
 
 #include "ggml-cuda.h"
 #include "ggml.h"
 
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#ifndef CC_TURING
 #define CC_TURING 700
+#endif
+
+#if defined(GGML_USE_HIPBLAS)
+#define __CUDA_ARCH__ 1300
+
+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+    return reinterpret_cast<const int&>(c);
+}
+
+static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(__gfx1100__)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+}
+#endif
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
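The hunk above ports the CUDA backend to HIP: cuBLAS and CUDA runtime symbols are #define-mapped onto their hipBLAS/HIP counterparts, and the `__dp4a` byte-wise dot-product intrinsic is emulated on AMD architectures that lack it. As a rough reference for what `__dp4a(a, b, c)` computes, here is a host-side sketch that mirrors the generic fallback branch in the hunk (the helper and test values are mine, not code from the gem):

```cpp
#include <cstdint>
#include <cstring>

// Reference semantics of __dp4a(a, b, c): treat a and b as four packed
// signed 8-bit lanes, multiply lane-wise, and accumulate the sum into c.
static int dp4a_reference(int a, int b, int c) {
    int8_t va[4], vb[4];
    std::memcpy(va, &a, 4);
    std::memcpy(vb, &b, 4);
    for (int i = 0; i < 4; ++i) {
        c += int(va[i]) * int(vb[i]);
    }
    return c;
}

int main() {
    // the lanes of 0x01020304 sum to 10 against all-ones, plus the accumulator 5
    return dp4a_reference(0x01020304, 0x01010101, 5) == 15 ? 0 : 1;
}
```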
@@ -259,6 +360,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
@@ -286,7 +388,7 @@ static int g_device_count = -1;
 static int g_main_device = 0;
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-static bool g_mul_mat_q = false;
+static bool g_mul_mat_q = true;
 
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
@@ -423,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q4_1 * x = (const block_q4_1 *) vx;
 
-    const dfloat d = x[ib].dm.x;
-    const dfloat m = x[ib].dm.y;
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
 
     const int vui = x[ib].qs[iqs];
 
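The `dm` field of the quantization blocks is a packed `half2` holding the scale and the minimum. The change above (and the analogous ones in the following hunks) stops reading it through `.x`/`.y` member access and goes through the `__low2half`/`__high2half` intrinsics, which also exist on HIP. A minimal sketch of the same unpacking in a device context (the `unpack_dm` helper is hypothetical, not part of the gem):

```cpp
#include <cuda_fp16.h>

// Portable unpacking of a packed half2 {d, m}: the low half carries the
// scale d, the high half the minimum m. Member access is avoided because
// the half2 type does not expose the same members on every backend.
__device__ __forceinline__ void unpack_dm(const half2 dm, float & d, float & m) {
    d = __low2float(dm);
    m = __high2float(dm);
}
```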
@@ -466,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q5_1 * x = (const block_q5_1 *) vx;
 
-    const dfloat d = x[ib].dm.x;
-    const dfloat m = x[ib].dm.y;
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
 
     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -519,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const uint8_t q = x[i].qs[32*n + l];
     float * y = yy + i*QK_K + 128*n;
 
-    float dall = x[i].dm.x;
-    float dmin = x[i].dm.y;
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
     y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -530,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
     const int il = tid%16; // 0...15
     const uint8_t q = x[i].qs[il] >> (2*is);
     float * y = yy + i*QK_K + 16*is + il;
-    float dall = x[i].dm.x;
-    float dmin = x[i].dm.y;
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
     y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
 #endif
@@ -617,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
 
     float * y = yy + i*QK_K + 64*il + n*ir;
 
-    const float dall = x[i].dm.x;
-    const float dmin = x[i].dm.y;
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
 
     const uint8_t * q = x[i].qs + 32*il + n*ir;
 
@@ -656,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
 
     float * y = yy + i*QK_K + 64*il + 2*ir;
 
-    const float dall = x[i].dm.x;
-    const float dmin = x[i].dm.y;
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
 
     const uint8_t * ql = x[i].qs + 32*il + 2*ir;
     const uint8_t * qh = x[i].qh + 2*ir;
@@ -769,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
     const float * y = yy + i * QK_K + y_offset;
     const uint8_t * q = x[i].qs + q_offset;
 
-    const float dall = x[i].dm.x;
-    const float dmin = x[i].dm.y;
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
 
     const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
     aux[0] = a[0] & 0x0f0f0f0f;
@@ -990,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     const float * y1 = yy + i*QK_K + y_offset;
     const float * y2 = y1 + 128;
 
-    const float dall = x[i].dm.x;
-    const float dmin = x[i].dm.y;
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
 
     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux[0] = a[im+0] & kmask1;
@@ -1123,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
     const float * y1 = yy + i*QK_K + y_offset;
     const float * y2 = y1 + 128;
 
-    const float dall = x[i].dm.x;
-    const float dmin = x[i].dm.y;
+    const float dall = __low2half(x[i].dm);
+    const float dmin = __high2half(x[i].dm);
 
     const uint16_t * a = (const uint16_t *)x[i].scales;
     aux[0] = a[im+0] & kmask1;
@@ -1347,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
         return;
     }
 
-    y[ib].ds.x = d;
-    y[ib].ds.y = sum;
+    reinterpret_cast<half&>(y[ib].ds.x) = d;
+    reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -2345,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
         u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
     }
 
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2431,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR2_K; ++ i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
     }
 
     return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -2550,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR3_K; ++i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
     }
 
     return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -2719,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 
     for (int i = 0; i < QR4_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds.x;
+        d8[i] = __low2half(bq8i->ds);
 
         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
@@ -2746,8 +2848,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     const float dall = bq4_K->d[0];
     const float dmin = bq4_K->d[1];
 
-    const float d8_1 = bq8_1[0].ds.x;
-    const float d8_2 = bq8_1[1].ds.x;
+    const float d8_1 = __low2float(bq8_1[0].ds);
+    const float d8_2 = __low2float(bq8_1[1].ds);
 
     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2900,7 +3002,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR5_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds.x;
+        d8[i] = __low2float(bq8i->ds);
 
         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
@@ -2918,8 +3020,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 
     const float d = bq5_K->d;
 
-    const float d8_1 = bq8_1[0].ds.x;
-    const float d8_2 = bq8_1[1].ds.x;
+    const float d8_1 = __low2half(bq8_1[0].ds);
+    const float d8_2 = __low2half(bq8_1[1].ds);
 
     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -3074,7 +3176,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 #pragma unroll
     for (int i = 0; i < QR6_K; ++i) {
         u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
+        d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
     }
 
     return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -3242,7 +3344,7 @@ static __device__ __forceinline__ void mul_mat_q(
             *dsi_dst = *dsi_src;
         } else {
             float * dfi_dst = (float *) dsi_dst;
-            *dfi_dst = (*dsi_src).x;
+            *dfi_dst = __low2half(*dsi_src);
         }
     }
 
@@ -3886,13 +3988,13 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
 // rope == RoPE == rotary positional embedding
 static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
                                 const float p_delta, const int p_delta_rows, const float theta_scale) {
-    const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
     if (col >= ncols) {
         return;
     }
 
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
     const int i = row*ncols + col;
 
     const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
@@ -3906,6 +4008,28 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }
 
+static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
+                                     const float p_delta, const int p_delta_rows, const float theta_scale) {
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i = row*ncols + col/2;
+
+    const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + ncols/2];
+
+    dst[i + 0]       = x0*cos_theta - x1*sin_theta;
+    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+}
+
 static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;
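The new `rope_neox_f32` kernel applies the GPT-NeoX flavour of rotary embeddings: instead of rotating adjacent element pairs (i, i+1) as `rope_f32` does, it pairs element i with element i + ncols/2. A CPU sketch of that rotation for a single row, with the per-row position folded into `p0` and `p_delta` left out (helper name and simplification are mine):

```cpp
#include <cmath>
#include <vector>

// NeoX-style RoPE on one row: element k of the first half is rotated
// against element k + ncols/2, with theta = p0 * theta_scale^k.
static void rope_neox_row(std::vector<float> & row, float p0, float theta_scale) {
    const int ncols = (int) row.size();
    for (int k = 0; k < ncols/2; ++k) {
        const float theta = p0 * std::pow(theta_scale, (float) k);
        const float s = std::sin(theta);
        const float c = std::cos(theta);
        const float x0 = row[k];
        const float x1 = row[k + ncols/2];
        row[k]           = x0*c - x1*s;
        row[k + ncols/2] = x0*s + x1*c;
    }
}
```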
@@ -3940,9 +4064,32 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
 }
 
-static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
+                                 const int n_heads_log2_floor, const float m0, const float m1) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (col >= ncols) {
+        return;
+    }
+
     const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i = row*ncols + col;
+
+    const int k = row/k_rows;
+
+    float m_k;
+    if (k < n_heads_log2_floor) {
+        m_k = powf(m0, k + 1);
+    } else {
+        m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+    }
+
+    dst[i] = col * m_k + x[i];
+}
+
+static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+    const int col = blockDim.y*blockIdx.y + threadIdx.y;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (col >= ncols) {
         return;
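`alibi_f32` adds the ALiBi bias: each head k gets a slope m_k, and every score at column `col` is shifted by `col * m_k`. The first `n_heads_log2_floor` heads use powers of m0, the remaining heads odd powers of m1. A host-side sketch of the slope table the kernel effectively uses (the helper is hypothetical; m0 and m1 are derived exactly as in `ggml_cuda_op_alibi` further down):

```cpp
#include <cmath>
#include <vector>

// ALiBi slopes per head, matching the kernel: dst[i] = col * m_k + x[i].
static std::vector<float> alibi_slopes(int n_head, float max_bias) {
    const int n_heads_log2_floor = 1 << (int) std::floor(std::log2((float) n_head));
    const float m0 = std::pow(2.0f, -max_bias / n_heads_log2_floor);
    const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

    std::vector<float> slopes(n_head);
    for (int k = 0; k < n_head; ++k) {
        slopes[k] = (k < n_heads_log2_floor)
            ? std::pow(m0, (float) (k + 1))
            : std::pow(m1, (float) (2 * (k - n_heads_log2_floor) + 1));
    }
    return slopes;
}
```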
@@ -3955,24 +4102,29 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
 
 // the CUDA soft max implementation differs from the CPU implementation
 // instead of doubles floats are used
-// values are also not normalized to the maximum value by subtracting it in the exponential function
-// theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
 static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-    const int block_size = blockDim.x;
-    const int tid = threadIdx.x;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int block_size = blockDim.y;
+    const int tid = threadIdx.y;
 
-    float tmp = 0.0;
+    float max_val = -INFINITY;
 
-    for (int block_start = 0; block_start < ncols; block_start += block_size) {
-        const int col = block_start + tid;
+    for (int col = tid; col < ncols; col += block_size) {
+        const int i = row*ncols + col;
+        max_val = max(max_val, x[i]);
+    }
 
-        if (col >= ncols) {
-            break;
-        }
+    // find the max value in the block
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+    }
+
+    float tmp = 0.f;
 
+    for (int col = tid; col < ncols; col += block_size) {
         const int i = row*ncols + col;
-        const float val = expf(x[i]);
+        const float val = expf(x[i] - max_val);
         tmp += val;
         dst[i] = val;
     }
@@ -3983,15 +4135,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
     }
 
-    for (int block_start = 0; block_start < ncols; block_start += block_size) {
-        const int col = block_start + tid;
-
-        if (col >= ncols) {
-            break;
-        }
+    const float inv_tmp = 1.f / tmp;
 
+    for (int col = tid; col < ncols; col += block_size) {
         const int i = row*ncols + col;
-        dst[i] /= tmp;
+        dst[i] *= inv_tmp;
     }
 }
 
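The rewritten `soft_max_f32` is numerically stable: it first reduces the row maximum with a warp shuffle, subtracts it inside `expf`, and finally scales by the reciprocal of the sum instead of dividing per element. A CPU equivalent of the new per-row computation, as a sketch:

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Stable softmax over one row, mirroring the updated kernel: shift by the
// row maximum before exponentiating, then multiply by 1/sum.
static void softmax_row(std::vector<float> & row) {
    const float max_val = *std::max_element(row.begin(), row.end());

    float sum = 0.0f;
    for (float & v : row) {
        v = std::exp(v - max_val);
        sum += v;
    }

    const float inv_sum = 1.0f / sum;
    for (float & v : row) {
        v *= inv_sum;
    }
}
```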
@@ -4751,13 +4899,21 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
 
 static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                           const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(nrows % 2 == 0);
-    const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+    GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
+    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-    const dim3 block_nums(num_blocks_x, nrows, 1);
+    const dim3 block_nums(nrows, num_blocks_x, 1);
     rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }
 
+static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+                               const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
+    const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nrows, num_blocks_x, 1);
+    rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
+}
+
 static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(nrows % 4 == 0);
     const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
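The launch geometry is transposed together with the kernels: rows are now indexed through the x dimension of the grid and column work through the y dimension of the block, matching the index math inside the rewritten kernels. A small host-side sketch of the work decomposition behind the new `rope_f32_cuda` launch (the example tensor shape is an assumption):

```cpp
#include <cstdio>

int main() {
    const int CUDA_ROPE_BLOCK_SIZE = 256;    // same constant as in ggml-cuda.cu
    const int nrows = 32, ncols = 4096;      // assumed example tensor shape

    // block_dims = (1, 2*CUDA_ROPE_BLOCK_SIZE, 1), block_nums = (nrows, num_blocks_x, 1)
    const int block_y      = 2*CUDA_ROPE_BLOCK_SIZE;
    const int num_blocks_x = (ncols + block_y - 1) / block_y;

    // each thread owns one (row, column pair); threads whose col >= ncols return early
    std::printf("grid (%d, %d, 1) x block (1, %d, 1)\n", nrows, num_blocks_x, block_y);
    std::printf("threads per row: %d, column pairs per row: %d\n",
                num_blocks_x * block_y, ncols/2);
    return 0;
}
```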
@@ -4766,16 +4922,25 @@ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, con
     rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
 }
 
+static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
+                           const int k_rows, const int n_heads_log2_floor, const float m0,
+                           const float m1, cudaStream_t stream) {
+    const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
+    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
+    const dim3 block_nums(num_blocks_x, nrows, 1);
+    alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
+}
+
 static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
-    const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
+    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
     const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
-    const dim3 block_nums(block_num_x, nrows_x, 1);
+    const dim3 block_nums(nrows_x, block_num_x, 1);
     diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 }
 
 static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums(1, nrows_x, 1);
+    const dim3 block_dims(1, WARP_SIZE, 1);
+    const dim3 block_nums(nrows_x, 1, 1);
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
 }
 
@@ -4880,10 +5045,18 @@ void ggml_init_cublas() {
     static bool initialized = false;
 
     if (!initialized) {
+
+#ifdef __HIP_PLATFORM_AMD__
+        // Workaround for a rocBLAS bug when using multiple graphics cards:
+        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+        rocblas_initialize();
+        CUDA_CHECK(cudaDeviceSynchronize());
+#endif
+
         CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
-        fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
         for (int id = 0; id < g_device_count; ++id) {
             cudaDeviceProp prop;
             CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
@@ -5481,7 +5654,8 @@ inline void ggml_cuda_op_rope(
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
-    const bool is_glm = mode & 4;
+    const bool is_neox = mode & 2;
+    const bool is_glm = mode & 4;
 
     // compute
     if (is_glm) {
@@ -5489,6 +5663,10 @@ inline void ggml_cuda_op_rope(
         const float id_p = min(p, n_ctx - 2.f);
         const float block_p = max(p - (n_ctx - 2.f), 0.f);
         rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+    } else if (is_neox) {
+        GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
+        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
+        rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
     } else {
         const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
         rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
@@ -5501,6 +5679,41 @@ inline void ggml_cuda_op_rope(
     (void) i1;
 }
 
+inline void ggml_cuda_op_alibi(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+    cudaStream_t & cudaStream_main){
+
+    GGML_ASSERT(src0_ddf_i != nullptr);
+    GGML_ASSERT(dst_ddf_i != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t i01_diff = i01_high - i01_low;
+
+    const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_head = ((int32_t *) dst->op_params)[1];
+    float max_bias;
+    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+
+    GGML_ASSERT(ne01 + n_past == ne00);
+    GGML_ASSERT(n_head == ne02);
+
+    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+
+    // compute
+    alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
+
+    (void) src1;
+    (void) src0_ddq_i;
+    (void) src1_ddf_i;
+    (void) i1;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
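`ggml_cuda_op_alibi` reads its arguments out of `dst->op_params`: two leading int32 slots (n_past and n_head) followed by a float max_bias stored bit-for-bit in the third slot. A standalone sketch of that packing convention (the array and values here are illustrative, not the real ggml structs):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
    int32_t op_params[8] = {0};

    // pack: two ints, then a float copied into the next 4 bytes
    op_params[0] = 5;                       // n_past
    op_params[1] = 32;                      // n_head
    const float max_bias = 8.0f;
    std::memcpy(&op_params[2], &max_bias, sizeof(float));

    // unpack: exactly what the CUDA op does
    const int n_past = op_params[0];
    const int n_head = op_params[1];
    float max_bias_out;
    std::memcpy(&max_bias_out, &op_params[2], sizeof(float));

    assert(n_past == 5 && n_head == 32 && max_bias_out == 8.0f);
    return 0;
}
```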
@@ -6121,6 +6334,11 @@ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ten
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }
 
+void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
+}
+
 void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
@@ -6240,7 +6458,7 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     return extra;
 }
 
-void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
+void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
     if (scratch && g_scratch_size == 0) {
         return;
     }
@@ -6249,14 +6467,19 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
-            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
+            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
         }
     }
     if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
+        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
     }
 
     tensor->backend = GGML_BACKEND_GPU;
+
+    if (scratch && no_alloc) {
+        return;
+    }
+
     struct ggml_tensor_extra_gpu * extra;
 
     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
@@ -6308,16 +6531,48 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     tensor->extra = extra;
 }
 
+void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
+    if (g_scratch_size == 0) {
+        return;
+    }
+    if (g_scratch_buffer == nullptr) {
+        CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
+    }
+
+    struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+
+    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
+        tensor->op == GGML_OP_VIEW;
+
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+        size_t view_offset = 0;
+        if (tensor->op == GGML_OP_VIEW) {
+            memcpy(&view_offset, tensor->op_params, sizeof(size_t));
+        }
+        extra->data_device[g_main_device] = src0_ddc + view_offset;
+    } else {
+        extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
+    }
+
+    tensor->extra = extra;
+}
+
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true, false);
+    ggml_cuda_assign_buffers_impl(tensor, true, false, false);
+}
+
+void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, true, false, true);
 }
 
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, false);
+    ggml_cuda_assign_buffers_impl(tensor, false, false, false);
 }
 
 void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, true);
+    ggml_cuda_assign_buffers_impl(tensor, false, true, false);
 }
 
 void ggml_cuda_set_main_device(int main_device) {
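The `no_alloc` flag splits buffer assignment into two phases: `ggml_cuda_assign_buffers_no_alloc` marks a tensor as GPU-resident without reserving scratch space, and `ggml_cuda_assign_scratch_offset` later binds it to a concrete offset inside the shared scratch buffer. A hedged usage sketch, assuming some external planner (such as a measuring pass over the graph) has already computed the offset; the wrapper function is mine, not part of the gem:

```cpp
#include "ggml.h"
#include "ggml-cuda.h"

// Two-phase placement: tag the tensor first, then bind it to scratch memory
// once its offset within the scratch buffer is known.
static void place_on_gpu_scratch(struct ggml_tensor * t, size_t planned_offset) {
    ggml_cuda_assign_buffers_no_alloc(t);                 // phase 1: no allocation yet
    ggml_cuda_assign_scratch_offset(t, planned_offset);   // phase 2: bind the offset
}
```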
@@ -6456,6 +6711,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
             }
             func = ggml_cuda_rope;
             break;
+        case GGML_OP_ALIBI:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cuda_alibi;
+            break;
         default:
             return false;
     }