llama_cpp 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/README.md +6 -5
- data/examples/chat.rb +13 -13
- data/examples/embedding.rb +9 -9
- data/ext/llama_cpp/llama_cpp.cpp +547 -272
- data/ext/llama_cpp/src/ggml-alloc.c +14 -8
- data/ext/llama_cpp/src/ggml-alloc.h +1 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +307 -127
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +200 -94
- data/ext/llama_cpp/src/ggml-metal.metal +264 -82
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +1647 -865
- data/ext/llama_cpp/src/ggml.h +143 -52
- data/ext/llama_cpp/src/llama.cpp +1427 -635
- data/ext/llama_cpp/src/llama.h +308 -119
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +5 -9
- data/sig/llama_cpp.rbs +65 -34
- metadata +3 -3
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -1,3 +1,4 @@
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -14,9 +15,11 @@
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
 #endif // __HIP_PLATFORM_AMD__
+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N HIPBLAS_OP_N
 #define CUBLAS_OP_T HIPBLAS_OP_T
 #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
@@ -31,6 +34,9 @@
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
+#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
 #define cudaDeviceProp hipDeviceProp_t
 #define cudaDeviceSynchronize hipDeviceSynchronize
 #define cudaError_t hipError_t
@@ -61,7 +67,7 @@
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event,
+#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #else
@@ -190,6 +196,12 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 } while (0)
 #endif // CUDART_VERSION >= 11

+#if CUDART_VERSION >= 11100
+#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+#else
+#define GGML_CUDA_ASSUME(x)
+#endif // CUDART_VERSION >= 11100
+
 #ifdef GGML_CUDA_F16
 typedef half dfloat; // dequantize float
 typedef half2 dfloat2;
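The GGML_CUDA_ASSUME macro introduced above wraps __builtin_assume, which nvcc only provides from CUDA 11.1 (CUDART_VERSION 11100) onward; on older toolkits it expands to nothing, so the bounds hints used in the tile-loading kernels further down become no-ops instead of build errors. A minimal standalone sketch of the same guard (the kernel and the assumed block size below are invented for illustration, not taken from the package):

    // assume.cu -- compile with: nvcc -O3 assume.cu
    #include <cstdio>
    #include <cuda_runtime.h>

    #if CUDART_VERSION >= 11100
    #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
    #else
    #define GGML_CUDA_ASSUME(x)            // older toolkits: expands to nothing
    #endif

    __global__ void scale(float * v, int n) {
        const int i = blockIdx.x*blockDim.x + threadIdx.x;
        GGML_CUDA_ASSUME(blockDim.x == 256); // optimizer hint only, never checked at runtime
        if (i < n) {
            v[i] *= 2.0f;
        }
    }

    int main() {
        const int n = 1024;
        float * v = nullptr;
        cudaMallocManaged(&v, n*sizeof(float));
        for (int i = 0; i < n; ++i) v[i] = 1.0f;
        scale<<<n/256, 256>>>(v, n);
        cudaDeviceSynchronize();
        printf("v[0] = %f\n", v[0]);
        cudaFree(v);
        return 0;
    }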
@@ -226,8 +238,12 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t *
     return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }

+template<typename T>
+using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
+typedef to_t_cuda_t<float> to_fp32_cuda_t;
+typedef to_t_cuda_t<half> to_fp16_cuda_t;
+
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
 typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
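The single to_fp32_cuda_t function-pointer typedef is generalized into the templated alias to_t_cuda_t<T>, so conversion launchers can produce either float or half output; to_fp16_cuda_t is what the new FP16 cuBLAS path further down requests through ggml_get_to_fp16_cuda. A self-contained sketch of the pattern, with a dummy converter standing in for the package's kernels:

    // to_t_alias.cu -- compile with: nvcc to_t_alias.cu
    #include <cstdio>
    #include <cuda_runtime.h>
    #include <cuda_fp16.h>

    // one alias covers "convert k source elements into T", for any output type T
    template<typename T>
    using to_t_cuda_t = void (*)(const void * x, T * y, int k, cudaStream_t stream);

    typedef to_t_cuda_t<float> to_fp32_cuda_t;
    typedef to_t_cuda_t<half>  to_fp16_cuda_t;

    __global__ void f32_to_f16_kernel(const float * x, half * y, int k) {
        const int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < k) y[i] = __float2half(x[i]);
    }

    // host-side launcher matching the to_t_cuda_t<half> signature
    static void convert_fp32_to_fp16(const void * x, half * y, int k, cudaStream_t stream) {
        const int block = 256;
        f32_to_f16_kernel<<<(k + block - 1)/block, block, 0, stream>>>((const float *) x, y, k);
    }

    int main() {
        to_fp16_cuda_t to_fp16 = convert_fp32_to_fp16; // chosen at runtime in the real code
        const int k = 256;
        float * x = nullptr; half * y = nullptr;
        cudaMallocManaged(&x, k*sizeof(float));
        cudaMallocManaged(&y, k*sizeof(half));
        for (int i = 0; i < k; ++i) x[i] = 0.5f*i;
        to_fp16(x, y, k, 0);
        cudaDeviceSynchronize();
        printf("y[10] = %f\n", __half2float(y[10]));
        cudaFree(x); cudaFree(y);
        return 0;
    }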
@@ -418,6 +434,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif

+#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
+#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
+#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
+
 #define MUL_MAT_SRC1_COL_STRIDE 128

 #define MAX_STREAMS 8
@@ -448,7 +468,7 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 static bool g_mul_mat_q = true;

 static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size =
+static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;

 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -1502,6 +1522,14 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }

+static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
+    const float * x = (const float *) vx;
+
+    // automatic half -> float type cast if dfloat == float
+    v.x = x[ib + iqs + 0];
+    v.y = x[ib + iqs + 1];
+}
+
 static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
     const int ix = blockDim.x*blockIdx.x + threadIdx.x;

@@ -1541,8 +1569,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
     reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }

-template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_block(const void * __restrict__ vx,
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;

     if (i >= k) {
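dequantize_block additionally gains a dst_t template parameter, so the same kernel body can store float or half output, and the new convert_f32 above acts as the trivial "dequantizer" when the input is already F32 (this is what convert_fp32_to_fp16_cuda further down instantiates). A reduced, compile-only sketch of the idea, with float2 standing in for dfloat2 and a simplified index calculation; launching it mirrors the convert_* helpers later in the diff:

    #include <cuda_runtime.h>
    #include <cuda_fp16.h>

    typedef void (*dequantize_kernel_t)(const void * vx, int ib, int iqs, float2 & v);

    __device__ void convert_f32(const void * vx, int ib, int iqs, float2 & v) {
        const float * x = (const float *) vx;
        v.x = x[ib + iqs + 0];
        v.y = x[ib + iqs + 1];
    }

    // qk/qr describe the quant block layout; dst_t is float or half
    template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
    __global__ void dequantize_block(const void * vx, dst_t * y, int k) {
        const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x); // each thread writes two values, k assumed even
        if (i >= k) return;

        float2 v;
        dequantize_kernel(vx, i, 0, v);  // ib = i, iqs = 0 for the qk == 1 "already float" case
        y[i + 0] = (dst_t) v.x;          // cast works for both float and half destinations
        y[i + 1] = (dst_t) v.y;
    }

    // instantiation analogous to convert_fp32_to_fp16_cuda below
    void f32_to_f16(const void * x, half * y, int k, cudaStream_t s) {
        dequantize_block<1, 1, convert_f32, half><<<(k + 511)/512, 256, 0, s>>>(x, y, k);
    }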
@@ -2145,10 +2173,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI4_0;
     const int kqsx = k % QI4_0;
@@ -2239,10 +2267,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI4_1;
     const int kqsx = k % QI4_1;
@@ -2331,10 +2359,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI5_0;
     const int kqsx = k % QI5_0;
@@ -2445,10 +2473,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI5_1;
     const int kqsx = k % QI5_1;
@@ -2551,10 +2579,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI8_0;
     const int kqsx = k % QI8_0;
@@ -2642,10 +2670,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI2_K;
     const int kqsx = k % QI2_K;
@@ -2763,10 +2791,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI3_K;
     const int kqsx = k % QI3_K;
@@ -2981,10 +3009,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI4_K; // == 0 if QK_K == 256
     const int kqsx = k % QI4_K; // == k if QK_K == 256
@@ -3162,10 +3190,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI5_K; // == 0 if QK_K == 256
     const int kqsx = k % QI5_K; // == k if QK_K == 256
@@ -3291,10 +3319,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-
-
-
-
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI6_K; // == 0 if QK_K == 256
     const int kqsx = k % QI6_K; // == k if QK_K == 256
@@ -4342,8 +4370,10 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
 }

 // rope == RoPE == rotary positional embedding
-
-
+
+template<typename T, bool has_pos>
+static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
+                            const int p_delta_rows, const float theta_scale) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

     if (col >= ncols) {
@@ -4352,8 +4382,11 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c

     const int row = blockDim.x*blockIdx.x + threadIdx.x;
     const int i = row*ncols + col;
+    const int i2 = row/p_delta_rows;

-    const
+    const int p = has_pos ? pos[i2] : 0;
+    const float p0 = p*freq_scale;
+    const float theta = p0*powf(theta_scale, col/2);
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);

@@ -4364,8 +4397,9 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }

-
-
+template<typename T, bool has_pos>
+static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
+                                 const int p_delta_rows, const float theta_scale) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

     if (col >= ncols) {
@@ -4374,8 +4408,11 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco

     const int row = blockDim.x*blockIdx.x + threadIdx.x;
     const int i = row*ncols + col/2;
+    const int i2 = row/p_delta_rows;

-    const
+    const int p = has_pos ? pos[i2] : 0;
+    const float p0 = p*freq_scale;
+    const float theta = p0*powf(theta_scale, col/2);
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);

@@ -4386,8 +4423,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
     dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }

-static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float
-    const
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
+                                    const int p_delta_rows, const float theta_scale, const int n_ctx) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;

@@ -4397,11 +4434,13 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol

     const int row = blockDim.y*blockIdx.y + threadIdx.y;
     const int i = row*ncols + col;
+    const int i2 = row/p_delta_rows;

     const float col_theta_scale = powf(theta_scale, col);
-
+    // FIXME: this is likely wrong
+    const int p = pos != nullptr ? pos[i2] : 0;

-    const float theta = min(p,
+    const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);

@@ -4411,7 +4450,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     dst[i + 0] = x0*cos_theta - x1*sin_theta;
     dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

-    const float block_theta = max(p -
+    const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
     const float sin_block_theta = sinf(block_theta);
     const float cos_block_theta = cosf(block_theta);

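The rewritten rope kernels take an explicit per-row position array (pos) instead of a precomputed start angle: for column col the angle is theta = pos * freq_scale * theta_scale^(col/2), with theta_scale = freq_base^(-2/n_dims) computed in ggml_cuda_op_rope further down, and each (x0, x1) pair is rotated by that angle. A small host-side check of the same arithmetic; the sizes and constants below are example values only, not taken from the diff:

    // rope_check.cpp / .cu -- plain host code, mirrors the kernel's powf/sinf/cosf math
    #include <math.h>
    #include <stdio.h>

    // rotate one (x0, x1) pair the way rope<T, has_pos> does
    static void rope_pair(float x0, float x1, int pos, int col, int n_dims,
                          float freq_base, float freq_scale, float & y0, float & y1) {
        const float theta_scale = powf(freq_base, -2.0f/n_dims);
        const float p0    = pos*freq_scale;
        const float theta = p0*powf(theta_scale, col/2);
        const float s = sinf(theta);
        const float c = cosf(theta);
        y0 = x0*c - x1*s;
        y1 = x0*s + x1*c;
    }

    int main() {
        float y0, y1;
        // example numbers: head size 128, freq_base 10000, no frequency scaling
        rope_pair(1.0f, 0.0f, /*pos=*/5, /*col=*/0, /*n_dims=*/128, 10000.0f, 1.0f, y0, y1);
        printf("rotated: (%f, %f)\n", y0, y1); // cos(5), sin(5) since col == 0
        return 0;
    }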
@@ -4813,6 +4852,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

+static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+}
+
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
@@ -4822,6 +4866,15 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }

+static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_F32:
+            return convert_fp32_to_fp16_cuda;
+        default:
+            return nullptr;
+    }
+}
+
 static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
@@ -5348,31 +5401,41 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }

-
-
+template<typename T>
+static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+                      const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
-
+    if (pos == nullptr) {
+        rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    } else {
+        rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    }
 }

-
-
+template<typename T>
+static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+                           const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
-
+    if (pos == nullptr) {
+        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    } else {
+        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
+    }
 }

-static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float
-    const
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+                              const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
     GGML_ASSERT(ncols % 4 == 0);
     const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
     const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols,
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
 }

 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -6003,8 +6066,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
     GGML_ASSERT(src1_ddf_i != nullptr);
     GGML_ASSERT(dst_dd_i != nullptr);

-    const float alpha = 1.0f;
-    const float beta = 0.0f;

     const int64_t ne00 = src0->ne[0];

@@ -6013,16 +6074,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
     const int64_t ne0 = dst->ne[0];
     const int64_t row_diff = row_high - row_low;

-    float * src0_ddq_as_f32;
-    size_t src0_as = 0;
-
-    if (src0->type != GGML_TYPE_F32) {
-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
-        src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
-        to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
-    }
-    const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
-
     int id;
     CUDA_CHECK(cudaGetDevice(&id));

@@ -6030,16 +6081,72 @@ inline void ggml_cuda_op_mul_mat_cublas(
     // ldc == nrows of the matrix that cuBLAS writes into
     int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;

-
-
-
-
-
-
-
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_TURING && src0->type == GGML_TYPE_F16 && ggml_is_contiguous(src0) && ldc == row_diff) {
+        // convert src1 to fp16, multiply as fp16, convert dst to fp32
+        half * src1_as_f16 = nullptr;
+        size_t src1_as = 0;
+        if (src1->type != GGML_TYPE_F16) {
+            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+            GGML_ASSERT(to_fp16_cuda != nullptr);
+            size_t ne = src1_ncols*ne10;
+            src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
+            to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
+        }
+        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
+
+        size_t dst_as = 0;
+        half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
+
+        const half alpha_f16 = 1.0f;
+        const half beta_f16 = 0.0f;
+
+        CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+        CUBLAS_CHECK(
+            cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                    row_diff, src1_ncols, ne10,
+                    &alpha_f16, src0_dd_i, CUDA_R_16F, ne00,
+                                src1_ptr,  CUDA_R_16F, ne10,
+                    &beta_f16,  dst_f16,   CUDA_R_16F, ldc,
+                    CUBLAS_COMPUTE_16F,
+                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));

-
-
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+        to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
+
+        ggml_cuda_pool_free(dst_f16, dst_as);
+
+        if (src1_as != 0) {
+            ggml_cuda_pool_free(src1_as_f16, src1_as);
+        }
+    }
+    else {
+        float * src0_ddq_as_f32 = nullptr;
+        size_t src0_as = 0;
+
+        if (src0->type != GGML_TYPE_F32) {
+            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+            GGML_ASSERT(to_fp32_cuda != nullptr);
+            src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
+            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
+        }
+        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
+
+        const float alpha = 1.0f;
+        const float beta = 0.0f;
+
+        CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+        CUBLAS_CHECK(
+            cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                    row_diff, src1_ncols, ne10,
+                    &alpha, src0_ddf_i, ne00,
+                            src1_ddf_i, ne10,
+                    &beta,  dst_dd_i,   ldc));
+
+        if (src0_as != 0) {
+            ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
+        }
     }

     (void) dst;
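ggml_cuda_op_mul_mat_cublas now has two paths: on sufficiently new GPUs with contiguous F16 src0 it converts src1 to FP16, multiplies with cublasGemmEx entirely in FP16 and converts the result back to F32; everything else falls back to the previous dequantize-to-F32 + cublasSgemm route. A compact sketch of just that dispatch decision; the CC_TURING value and the capability encoding are assumptions about this ggml revision, not taken from the diff:

    #include <cstdio>

    enum class gemm_path { FP16_GEMM, FP32_GEMM };

    // mirrors the new branch condition; ggml_is_contiguous(src0) and ldc == row_diff
    // are collapsed into a single "contiguous" flag here
    static gemm_path choose_mul_mat_cublas_path(int compute_capability, bool src0_is_f16, bool contiguous) {
        const int CC_TURING = 700; // threshold used by this ggml revision (assumption)
        if (compute_capability >= CC_TURING && src0_is_f16 && contiguous) {
            return gemm_path::FP16_GEMM; // cublasGemmEx, CUDA_R_16F in/out, CUBLAS_COMPUTE_16F
        }
        return gemm_path::FP32_GEMM;     // dequantize src0 to F32 if needed, then cublasSgemm
    }

    int main() {
        // capability encoded as 100*major + 10*minor, which appears to be how ggml stores it
        printf("%d\n", (int) choose_mul_mat_cublas_path(860, true, true)); // 0 -> FP16_GEMM
        printf("%d\n", (int) choose_mul_mat_cublas_path(610, true, true)); // 1 -> FP32_GEMM
        return 0;
    }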
@@ -6051,14 +6158,16 @@ inline void ggml_cuda_op_rope(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);

     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
+    const int64_t ne2 = dst->ne[2];
     const int64_t nrows = ggml_nrows(src0);

-    const int n_past = ((int32_t *) dst->op_params)[0];
+    //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode = ((int32_t *) dst->op_params)[2];
     const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -6069,19 +6178,38 @@ inline void ggml_cuda_op_rope(
     memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-
+
+    const int32_t * pos = nullptr;
+    if ((mode & 1) == 0) {
+        GGML_ASSERT(src1->type == GGML_TYPE_I32);
+        GGML_ASSERT(src1->ne[0] == ne2);
+        pos = (const int32_t *) src1_dd;
+    }

     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;

     // compute
     if (is_glm) {
-
+        GGML_ASSERT(false);
+        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
     } else if (is_neox) {
         GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
-
+        if (src0->type == GGML_TYPE_F32) {
+            rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+        } else {
+            GGML_ASSERT(false);
+        }
     } else {
-
+        if (src0->type == GGML_TYPE_F32) {
+            rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
+        } else {
+            GGML_ASSERT(false);
+        }
     }

     (void) src1;
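ggml_cuda_op_rope now reads per-token positions from src1 (an I32 tensor whose first dimension must match dst->ne[2]) instead of the old n_past scalar, and dispatches on the mode bits: bit 1 selects the NeoX-style kernel, bit 2 the GLM path (still asserted off here), and positions are only consulted when bit 0 is clear. A tiny standalone decoder for those flags:

    #include <cstdio>

    struct rope_mode {
        bool needs_pos; // the src1 positions tensor is read only in this case
        bool is_neox;
        bool is_glm;
    };

    static rope_mode decode_rope_mode(int mode) {
        rope_mode m;
        m.needs_pos = (mode & 1) == 0;
        m.is_neox   = (mode & 2) != 0;
        m.is_glm    = (mode & 4) != 0;
        return m;
    }

    int main() {
        const rope_mode m = decode_rope_mode(2); // NeoX-style RoPE with positions
        printf("needs_pos=%d is_neox=%d is_glm=%d\n", m.needs_pos, m.is_neox, m.is_glm);
        return 0;
    }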
@@ -6252,6 +6380,43 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     }
 }

+static void ggml_cuda_set_peer_access(const int n_tokens) {
+    static bool peer_access_enabled = false;
+
+    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
+
+    if (peer_access_enabled == enable_peer_access) {
+        return;
+    }
+
+#ifdef NDEBUG
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+
+        for (int id_other = 0; id_other < g_device_count; ++id_other) {
+            if (id == id_other) {
+                continue;
+            }
+            if (id != g_main_device && id_other != g_main_device) {
+                continue;
+            }
+
+            int can_access_peer;
+            CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
+            if (can_access_peer) {
+                if (enable_peer_access) {
+                    CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
+                } else {
+                    CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
+                }
+            }
+        }
+    }
+#endif // NDEBUG
+
+    peer_access_enabled = enable_peer_access;
+}
+
 static void ggml_cuda_op_mul_mat(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
     const bool convert_src1_to_q8_1) {
@@ -6276,6 +6441,8 @@ static void ggml_cuda_op_mul_mat(
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];

+    ggml_cuda_set_peer_access(ne11);
+
     GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);

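Peer-to-peer access between GPUs is now toggled per batch: ggml_cuda_op_mul_mat calls ggml_cuda_set_peer_access(ne11), which enables access between the main device and the other devices only while the batch is at most GGML_CUDA_PEER_MAX_BATCH_SIZE tokens (128 by default), and only in NDEBUG builds. A standalone sketch of the enable/disable pattern, reduced to device 0 and its peers (error codes deliberately ignored, unlike the checked calls in the real code):

    // peer_toggle.cu -- compile with: nvcc peer_toggle.cu
    #include <cstdio>
    #include <cuda_runtime.h>

    // let device 0 access (or stop accessing) the memory of every other visible device
    static void set_peer_access_from_device0(bool enable) {
        int n_devices = 0;
        cudaGetDeviceCount(&n_devices);
        for (int other = 1; other < n_devices; ++other) {
            int can_access = 0;
            cudaDeviceCanAccessPeer(&can_access, 0, other);
            if (!can_access) continue;

            cudaSetDevice(0);
            if (enable) {
                cudaDeviceEnablePeerAccess(other, 0); // flags must be 0
            } else {
                cudaDeviceDisablePeerAccess(other);
            }
        }
    }

    int main() {
        set_peer_access_from_device0(true);  // e.g. before a small-batch mul_mat
        set_peer_access_from_device0(false); // and off again for large batches
        printf("done\n");
        return 0;
    }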
@@ -6408,7 +6575,7 @@ static void ggml_cuda_op_mul_mat(

         // wait for main GPU data if necessary
         if (split && (id != g_main_device || is != 0)) {
-            CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
+            CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
         }

         for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
@@ -6530,7 +6697,7 @@ static void ggml_cuda_op_mul_mat(
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         for (int64_t id = 0; id < g_device_count; ++id) {
             for (int64_t is = 0; is < is_max; ++is) {
-                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
             }
         }
     }
@@ -6541,27 +6708,27 @@ static void ggml_cuda_op_mul_mat(
     }
 }

-void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }

-void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
 }

-void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
 }

-void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }

-void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }

-void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }

@@ -6572,17 +6739,13 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
     const int64_t ne1 = dst->ne[1];

     // TODO: find the optimal values for these
-
-
-
-
-        return true;
-    }
-
-    return false;
+    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+            src1->type == GGML_TYPE_F32 &&
+             dst->type == GGML_TYPE_F32 &&
+            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
 }

-void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
+static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
|
|
6611
6774
|
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
|
6612
6775
|
}
|
6613
6776
|
|
6614
|
-
void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
6777
|
+
static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
6615
6778
|
GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
|
6616
6779
|
GGML_ASSERT(!ggml_is_permuted(src0));
|
6617
6780
|
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
@@ -6645,7 +6808,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
|
|
6645
6808
|
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
|
6646
6809
|
}
|
6647
6810
|
|
6648
|
-
void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6811
|
+
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6649
6812
|
bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
|
6650
6813
|
src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
|
6651
6814
|
|
@@ -6689,11 +6852,11 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
|
|
6689
6852
|
}
|
6690
6853
|
}
|
6691
6854
|
|
6692
|
-
void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6855
|
+
static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6693
6856
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
|
6694
6857
|
}
|
6695
6858
|
|
6696
|
-
void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6859
|
+
static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6697
6860
|
const int64_t ne = ggml_nelements(src0);
|
6698
6861
|
GGML_ASSERT(ne == ggml_nelements(src1));
|
6699
6862
|
|
@@ -6735,35 +6898,37 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
|
|
6735
6898
|
ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
|
6736
6899
|
ne10, ne11, nb10, nb11, nb12, main_stream);
|
6737
6900
|
} else {
|
6901
|
+
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
6902
|
+
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
6738
6903
|
GGML_ASSERT(false);
|
6739
6904
|
}
|
6740
6905
|
|
6741
6906
|
(void) dst;
|
6742
6907
|
}
|
6743
6908
|
|
6744
|
-
void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6909
|
+
static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6745
6910
|
ggml_cuda_cpy(src0, dst, nullptr);
|
6746
6911
|
(void) src1;
|
6747
6912
|
}
|
6748
6913
|
|
6749
|
-
void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6914
|
+
static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6750
6915
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
|
6751
6916
|
}
|
6752
6917
|
|
6753
|
-
void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6918
|
+
static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6754
6919
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
|
6755
6920
|
}
|
6756
6921
|
|
6757
|
-
void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6922
|
+
static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6758
6923
|
GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
|
6759
6924
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
|
6760
6925
|
}
|
6761
6926
|
|
6762
|
-
void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6927
|
+
static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6763
6928
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
|
6764
6929
|
}
|
6765
6930
|
|
6766
|
-
void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6931
|
+
static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
6767
6932
|
(void) src0;
|
6768
6933
|
(void) src1;
|
6769
6934
|
(void) dst;
|
@@ -6886,11 +7051,13 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
|
6886
7051
|
return extra;
|
6887
7052
|
}
|
6888
7053
|
|
6889
|
-
void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
|
7054
|
+
static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
|
6890
7055
|
if (scratch && g_scratch_size == 0) {
|
6891
7056
|
return;
|
6892
7057
|
}
|
6893
7058
|
|
7059
|
+
tensor->backend = GGML_BACKEND_GPU;
|
7060
|
+
|
6894
7061
|
// recursively assign CUDA buffers until a compute tensor is found
|
6895
7062
|
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
|
6896
7063
|
const ggml_op src0_op = tensor->src[0]->op;
|
@@ -6902,8 +7069,6 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
|
6902
7069
|
ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
|
6903
7070
|
}
|
6904
7071
|
|
6905
|
-
tensor->backend = GGML_BACKEND_GPU;
|
6906
|
-
|
6907
7072
|
if (scratch && no_alloc) {
|
6908
7073
|
return;
|
6909
7074
|
}
|
@@ -6964,6 +7129,7 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
|
|
6964
7129
|
return;
|
6965
7130
|
}
|
6966
7131
|
if (g_scratch_buffer == nullptr) {
|
7132
|
+
ggml_cuda_set_device(g_main_device);
|
6967
7133
|
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
|
6968
7134
|
}
|
6969
7135
|
|
@@ -6987,6 +7153,15 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
|
|
6987
7153
|
tensor->extra = extra;
|
6988
7154
|
}
|
6989
7155
|
|
7156
|
+
void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
|
7157
|
+
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
7158
|
+
GGML_ASSERT(ggml_is_contiguous(tensor));
|
7159
|
+
|
7160
|
+
struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
7161
|
+
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
7162
|
+
CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
|
7163
|
+
}
|
7164
|
+
|
6990
7165
|
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
|
6991
7166
|
ggml_cuda_assign_buffers_impl(tensor, true, false, false);
|
6992
7167
|
}
|
@@ -7003,7 +7178,7 @@ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
|
|
7003
7178
|
ggml_cuda_assign_buffers_impl(tensor, false, true, false);
|
7004
7179
|
}
|
7005
7180
|
|
7006
|
-
void ggml_cuda_set_main_device(int main_device) {
|
7181
|
+
void ggml_cuda_set_main_device(const int main_device) {
|
7007
7182
|
if (main_device >= g_device_count) {
|
7008
7183
|
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
|
7009
7184
|
main_device, g_device_count, g_main_device);
|
@@ -7017,12 +7192,17 @@ void ggml_cuda_set_main_device(int main_device) {
|
|
7017
7192
|
}
|
7018
7193
|
}
|
7019
7194
|
|
7020
|
-
void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
|
7195
|
+
void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
|
7021
7196
|
g_mul_mat_q = mul_mat_q;
|
7022
7197
|
}
|
7023
7198
|
|
7024
|
-
void ggml_cuda_set_scratch_size(size_t scratch_size) {
|
7025
|
-
|
7199
|
+
void ggml_cuda_set_scratch_size(const size_t scratch_size) {
|
7200
|
+
// this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
|
7201
|
+
// it still won't always work as expected, but it's better than nothing
|
7202
|
+
if (scratch_size > g_scratch_size) {
|
7203
|
+
ggml_cuda_free_scratch();
|
7204
|
+
}
|
7205
|
+
g_scratch_size = std::max(g_scratch_size, scratch_size);
|
7026
7206
|
}
|
7027
7207
|
|
7028
7208
|
void ggml_cuda_free_scratch() {
|
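ggml_cuda_set_scratch_size no longer just stores the value: the scratch buffer now defaults to 0 (disabled), only ever grows, and when a larger size is requested while a buffer already exists the old buffer is freed so the next lazy allocation (see the cudaMalloc hunk above) happens at the new size. The grow-only pattern in isolation, with plain free standing in for cudaFree and the lazy allocation left out:

    #include <algorithm>
    #include <cstdio>
    #include <cstdlib>

    static void * g_scratch_buffer = nullptr;
    static size_t g_scratch_size   = 0; // disabled by default, as in the new code

    static void free_scratch() {
        free(g_scratch_buffer);      // the real code uses cudaFree
        g_scratch_buffer = nullptr;
    }

    static void set_scratch_size(size_t scratch_size) {
        if (scratch_size > g_scratch_size) {
            free_scratch();          // drop the too-small buffer; it is re-allocated lazily on next use
        }
        g_scratch_size = std::max(g_scratch_size, scratch_size);
    }

    int main() {
        set_scratch_size(64u << 20);
        set_scratch_size(32u << 20); // smaller request: the size stays at 64 MiB
        printf("scratch size: %zu bytes\n", g_scratch_size);
        return 0;
    }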