llama_cpp 0.9.2 → 0.9.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +378 -208
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +260 -46
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +113 -32
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +174 -74
- data/ext/llama_cpp/src/ggml.c +881 -1459
- data/ext/llama_cpp/src/ggml.h +64 -45
- data/ext/llama_cpp/src/llama.cpp +555 -49
- data/ext/llama_cpp/src/llama.h +77 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml-cuda.cu

@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <cinttypes>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -81,12 +82,15 @@
 
 #include "ggml-cuda.h"
 #include "ggml.h"
+#include "ggml-backend-impl.h"
 
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA 700
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 
+#define GGML_CUDA_MAX_NODES 8192
+
 // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
 // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
 // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
@@ -232,7 +236,7 @@ typedef float2 dfloat2;
 #endif //GGML_CUDA_F16
 
 static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
     int x32 = 0;
     x32 |= x16[0] << 0;
@@ -242,7 +246,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
 }
 
 static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
     int x32 = 0;
     x32 |= x16[0] << 0;
@@ -252,11 +256,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
 }
 
 static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
-    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
-    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 template<typename T>
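The four hunks above only tighten const-correctness: converting a pointer-to-const to a pointer whose pointee type is non-const silently drops the qualifier and trips -Wcast-qual. A minimal standalone sketch of the fixed pattern (illustrative names, not from the gem):

```cuda
#include <cstdint>

// Keeping const in the cast preserves the qualifier; the generated loads
// are identical to the old code, only the warning goes away.
static int get_int(const int8_t * x8, int i32) {
    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32);
    return (x16[0] << 0) | (x16[1] << 16);
}

int main() {
    alignas(4) const int8_t bytes[8] = {1, 0, 0, 0, 0, 0, 0, 0};
    return get_int(bytes, 0) == 1 ? 0 : 1;   // little-endian: reads 1
}
```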
@@ -433,6 +437,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_RELU_BLOCK_SIZE 256
+#define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_CLAMP_BLOCK_SIZE 256
@@ -464,7 +470,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
 #define MUL_MAT_SRC1_COL_STRIDE 128
 
 #define MAX_STREAMS 8
-static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
 
 struct ggml_tensor_extra_gpu {
     void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
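The extra braces are a warning fix, not a behavior change: g_cudaStreams is two-dimensional, so -Wmissing-braces wants one brace level per dimension; the remaining elements are value-initialized to nullptr either way. A standalone sketch (stream_t stands in for cudaStream_t):

```cuda
typedef void * stream_t;   // stand-in for cudaStream_t in this sketch

// One brace level per array dimension; everything not listed is
// value-initialized, exactly as with the single-brace form.
static stream_t g_streams[16][8] = { { nullptr } };

int main() {
    return g_streams[15][7] == nullptr ? 0 : 1;
}
```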
@@ -553,6 +559,24 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __global__ void relu_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fmaxf(x[i], 0);
+}
+
+static __global__ void sqr_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * x[i];
+}
+
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
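relu_f32 and sqr_f32 follow the file's standard elementwise pattern: one thread per element, a grid sized by ceiling division, and an early return for the threads past k. A self-contained sketch of the same pattern (simplified names, not the gem's code):

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Same shape as relu_f32 above: one thread per element, guarded tail.
__global__ void relu(const float * x, float * y, const int k) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    y[i] = fmaxf(x[i], 0.0f);
}

int main() {
    const int k = 4;
    const int block_size = 256;                               // cf. CUDA_RELU_BLOCK_SIZE
    const int num_blocks = (k + block_size - 1) / block_size; // ceil(k / block_size)

    float h[k] = { -1.0f, 0.5f, -2.0f, 3.0f };
    float * d = nullptr;
    cudaMalloc(&d, k * sizeof(float));
    cudaMemcpy(d, h, k * sizeof(float), cudaMemcpyHostToDevice);
    relu<<<num_blocks, block_size>>>(d, d, k);                // in-place is fine here
    cudaMemcpy(h, d, k * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d);
    printf("%g %g %g %g\n", h[0], h[1], h[2], h[3]);          // 0 0.5 0 3
    return 0;
}
```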
@@ -2225,6 +2249,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
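Most of the hunks that follow add the same kind of one-liner: (void)x marks a parameter that a particular quantization variant does not use, silencing -Wunused-parameter while keeping one shared signature across all the allocate_tiles_*/load_tiles_*/vec_dot_* templates. A reduced host-side sketch:

```cuda
// All allocate_tiles_* share one signature so they can be swapped in as
// template parameters, but e.g. q4_0 has no qh/sc tiles to allocate.
template <int mmq_y>
static void allocate_tiles_demo(int ** x_ql, int ** x_qh, int ** x_sc) {
    (void)x_qh; (void)x_sc;   // intentionally unused in this variant

    static int tile_x_qs[mmq_y];
    *x_ql = tile_x_qs;
}

int main() {
    int * ql = nullptr;
    int * qh = nullptr;
    int * sc = nullptr;
    allocate_tiles_demo<64>(&ql, &qh, &sc);
    return ql != nullptr ? 0 : 1;
}
```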
@@ -2236,7 +2261,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-
+    (void)x_qh; (void)x_sc;
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
     GGML_CUDA_ASSUME(k >= 0);
@@ -2245,7 +2270,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_0;
     const int kqsx = k % QI4_0;
 
-    const block_q4_0 * bx0 = (block_q4_0 *) vx;
+    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
 
     float * x_dmf = (float *) x_dm;
 
@@ -2283,9 +2308,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const float * x_dmf = (float *) x_dm;
+    const float * x_dmf = (const float *) x_dm;
 
     int u[2*VDR_Q4_0_Q8_1_MMQ];
 
@@ -2319,6 +2345,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
@@ -2330,6 +2357,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2339,7 +2367,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_1;
     const int kqsx = k % QI4_1;
 
-    const block_q4_1 * bx0 = (block_q4_1 *) vx;
+    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2374,6 +2402,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 
@@ -2411,6 +2440,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
@@ -2422,6 +2452,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2431,7 +2462,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_0;
     const int kqsx = k % QI5_0;
 
-    const block_q5_0 * bx0 = (block_q5_0 *) vx;
+    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2486,6 +2517,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
@@ -2525,6 +2557,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
@@ -2536,6 +2569,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2545,7 +2579,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_1;
     const int kqsx = k % QI5_1;
 
-    const block_q5_1 * bx0 = (block_q5_1 *) vx;
+    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2597,6 +2631,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@@ -2631,6 +2666,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
@@ -2642,6 +2678,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2652,7 +2689,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kqsx = k % QI8_0;
     float * x_dmf = (float *) x_dm;
 
-    const block_q8_0 * bx0 = (block_q8_0 *) vx;
+    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2687,6 +2724,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
@@ -2720,6 +2758,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
@@ -2733,6 +2772,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2742,7 +2782,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI2_K;
     const int kqsx = k % QI2_K;
 
-    const block_q2_K * bx0 = (block_q2_K *) vx;
+    const block_q2_K * bx0 = (const block_q2_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2790,6 +2830,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const int kbx = k / QI2_K;
     const int ky = (k % QI2_K) * QR2_K;
@@ -2863,7 +2904,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI3_K;
     const int kqsx = k % QI3_K;
 
-    const block_q3_K * bx0 = (block_q3_K *) vx;
+    const block_q3_K * bx0 = (const block_q3_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2944,7 +2985,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
 
-    const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
 
     int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
 
@@ -3059,6 +3100,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@@ -3072,6 +3114,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3081,7 +3124,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_K; // == 0 if QK_K == 256
     const int kqsx = k % QI4_K; // == k if QK_K == 256
 
-    const block_q4_K * bx0 = (block_q4_K *) vx;
+    const block_q4_K * bx0 = (const block_q4_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3126,7 +3169,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
     const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
 
-    const int * scales = (int *) bxi->scales;
+    const int * scales = (const int *) bxi->scales;
 
     const int ksc = k % (WARP_SIZE/8);
 
@@ -3141,6 +3184,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
 
@@ -3240,6 +3284,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@@ -3253,6 +3298,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3262,7 +3308,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_K; // == 0 if QK_K == 256
     const int kqsx = k % QI5_K; // == k if QK_K == 256
 
-    const block_q5_K * bx0 = (block_q5_K *) vx;
+    const block_q5_K * bx0 = (const block_q5_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3318,7 +3364,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
     const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
 
-    const int * scales = (int *) bxi->scales;
+    const int * scales = (const int *) bxi->scales;
 
     const int ksc = k % (WARP_SIZE/8);
 
@@ -3333,6 +3379,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
 
@@ -3369,6 +3416,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@@ -3382,6 +3430,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3391,7 +3440,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI6_K; // == 0 if QK_K == 256
     const int kqsx = k % QI6_K; // == k if QK_K == 256
 
-    const block_q6_K * bx0 = (block_q6_K *) vx;
+    const block_q6_K * bx0 = (const block_q6_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3453,6 +3502,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
@@ -3495,7 +3545,7 @@ static __device__ __forceinline__ void mul_mat_q(
     __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
     __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
 
-    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
 
     for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
 
@@ -4468,6 +4518,13 @@ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
     *dsti = __float2half(*xi);
 }
 
+static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+    const half * xi = (const half *) cxi;
+    half * dsti = (half *) cdsti;
+
+    *dsti = *xi;
+}
+
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
                                    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
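cpy_1_f16_f16 is a per-element copier that the templated cpy_f32_f16 kernel receives as its cpy_1 template parameter (see the ggml_cpy_f16_f16_cuda hunk further down). A stripped-down sketch of that mechanism, with contiguous addressing only and illustrative names:

```cuda
#include <cuda_runtime.h>

typedef void (*cpy_kernel_t)(const char * cx, char * cdst);

__device__ void cpy_1_f32(const char * cxi, char * cdsti) {
    *(float *) cdsti = *(const float *) cxi;
}

// The element copier is baked in at compile time, so there is no indirect
// call in the inner loop -- the same idea as cpy_f32_f16<cpy_1_f16_f16>.
template <cpy_kernel_t cpy_1>
__global__ void cpy_contiguous(const char * cx, char * cdst, const int ne, const int ts) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= ne) {
        return;
    }
    cpy_1(cx + i * ts, cdst + i * ts);
}

// launch: cpy_contiguous<cpy_1_f32><<<num_blocks, 32>>>(src, dst, ne, sizeof(float));
```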
@@ -4721,6 +4778,25 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
     dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
 }
 
+static __global__ void im2col_f32_f16(
+        const float * x, half * dst,
+        int ofs0, int ofs1, int IW, int IH, int CHW,
+        int s0, int s1, int p0, int p1, int d0, int d1) {
+    const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
+    const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
+
+    const int offset_dst =
+        (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
+        (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        dst[offset_dst] = __float2half(0.0f);
+    } else {
+        const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
+        dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
+    }
+}
+
 template<int qk, int qr, dequantize_kernel_t dq>
 static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
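The index arithmetic is easier to follow against a host reference: the launcher further down uses a grid of (IC, OH, OW) blocks and (N, KH, KW) threads, so each thread fills one (output pixel, kernel tap) cell of an [N*OH*OW, IC*KH*KW] matrix, writing zero for taps that land in the padding. A hedged CPU equivalent for a single image (float output instead of half, illustrative only):

```cuda
#include <cstddef>
#include <vector>

// CPU reference for one image (N = 1). s = stride, p = padding, d = dilation;
// dst is [OH*OW, IC*KH*KW] row-major, matching the kernel's offset_dst.
std::vector<float> im2col_ref(const float * src, int IC, int IH, int IW,
                              int KH, int KW, int OH, int OW,
                              int s0, int s1, int p0, int p1, int d0, int d1) {
    std::vector<float> dst((std::size_t)OH * OW * IC * KH * KW, 0.0f);
    for (int oh = 0; oh < OH; ++oh)
    for (int ow = 0; ow < OW; ++ow)
    for (int ic = 0; ic < IC; ++ic)
    for (int kh = 0; kh < KH; ++kh)
    for (int kw = 0; kw < KW; ++kw) {
        const int ih = oh * s1 + kh * d1 - p1;
        const int iw = ow * s0 + kw * d0 - p0;
        if (ih < 0 || ih >= IH || iw < 0 || iw >= IW) {
            continue;                          // padding region stays zero
        }
        dst[(((std::size_t)oh * OW + ow) * IC + ic) * KH * KW + (std::size_t)kh * KW + kw] =
            src[((std::size_t)ic * IH + ih) * IW + iw];
    }
    return dst;
}
```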
@@ -4759,6 +4835,16 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
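Both wrappers size the grid with the file's usual ceiling-division idiom; for positive integers, (k + B - 1) / B equals ceil(k / B):

```cuda
#include <cassert>

int main() {
    const int B = 256;                // e.g. CUDA_RELU_BLOCK_SIZE
    assert((  0 + B - 1) / B == 0);
    assert((  1 + B - 1) / B == 1);
    assert((256 + B - 1) / B == 1);
    assert((257 + B - 1) / B == 2);   // one extra block for the remainder
    return 0;
}
```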
@@ -5611,6 +5697,16 @@ static void ggml_cpy_f32_f16_cuda(
         (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
 }
 
+static void ggml_cpy_f16_f16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
 static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
@@ -5694,6 +5790,15 @@ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, c
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
 }
 
+static void im2col_f32_f16_cuda(const float * x, half * dst,
+    int OH, int IW, int IH, int OW, int IC,
+    int KH, int KW, int N, int ofs0, int ofs1,
+    int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
+    dim3 block_nums(IC, OH, OW);
+    dim3 block_dims(N, KH, KW);
+    im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+}
+
 // buffer pool for cuda
 #define MAX_CUDA_BUFFERS 256
 
@@ -5762,7 +5867,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         return ptr;
     }
 #ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+    fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
             (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
 #endif
     void * ptr;
@@ -5900,7 +6005,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // The allocation error can be bypassed. A null ptr will assigned out of this function.
         // This can fixed the OOM error in WSL.
         cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+        fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -5945,18 +6050,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
         return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
-    } else if (nb0 == ts) {
+    }
+    if (nb0 == ts) {
         return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
-    } else {
-        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
-            const void * rx = (const void *) ((const char *) x + i1*nb1);
-            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
-            // pretend the row is a matrix with cols=1
-            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
-            if (r != cudaSuccess) return r;
-        }
-        return cudaSuccess;
     }
+    for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+        const void * rx = (const void *) ((const char *) x + i1*nb1);
+        void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+        // pretend the row is a matrix with cols=1
+        cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+        if (r != cudaSuccess) { return r; }
+    }
+    return cudaSuccess;
 }
 
 static void ggml_cuda_op_repeat(
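The rewrite is behavior-preserving; it only flattens else-after-return chains into early returns. For reference, the pitched-copy case restated as a self-contained helper with the cudaMemcpy2DAsync parameters labeled (an annotation, not the gem's code):

```cuda
#include <cuda_runtime.h>

// cudaMemcpy2DAsync copies `rows` rows of `row_bytes` payload, stepping the
// source by `src_pitch` bytes per row and packing the destination densely.
static cudaError_t copy_rows_packed(char * dst, const char * src,
                                    size_t row_bytes, size_t src_pitch,
                                    size_t rows, cudaStream_t stream) {
    return cudaMemcpy2DAsync(dst, /*dpitch=*/row_bytes,
                             src, /*spitch=*/src_pitch,
                             /*width=*/row_bytes, /*height=*/rows,
                             cudaMemcpyDeviceToDevice, stream);
}
```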
@@ -6128,6 +6233,34 @@ inline void ggml_cuda_op_silu(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_relu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_sqr(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6250,6 +6383,7 @@ static int64_t get_row_rounding(ggml_type type) {
     case GGML_TYPE_Q8_0:
         return max_compute_capability >= CC_RDNA2 ? 128 : 64;
     case GGML_TYPE_F16:
+    case GGML_TYPE_F32:
         return 1;
     case GGML_TYPE_Q2_K:
         return max_compute_capability >= CC_RDNA2 ? 128 : 32;
@@ -6272,6 +6406,7 @@ static int64_t get_row_rounding(ggml_type type) {
     case GGML_TYPE_Q8_0:
         return 64;
     case GGML_TYPE_F16:
+    case GGML_TYPE_F32:
         return 1;
     case GGML_TYPE_Q2_K:
     case GGML_TYPE_Q3_K:
@@ -6463,8 +6598,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
         src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
         to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
     }
-    const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
-
+    const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
     size_t dst_as = 0;
     half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
 
@@ -6639,6 +6773,45 @@ inline void ggml_cuda_op_alibi(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_im2col(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+    const int64_t N = src1->ne[is_2D ? 3 : 2];
+    const int64_t IC = src1->ne[is_2D ? 2 : 1];
+    const int64_t IH = is_2D ? src1->ne[1] : 1;
+    const int64_t IW = src1->ne[0];
+
+    const int64_t KH = is_2D ? src0->ne[1] : 1;
+    const int64_t KW = src0->ne[0];
+
+    const int64_t OH = is_2D ? dst->ne[2] : 1;
+    const int64_t OW = dst->ne[1];
+
+    const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
+    const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+
+    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
+        OH, IW, IH, OW, IC, KH, KW, N,
+        ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
+
+    (void) src0;
+    (void) src0_dd;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6843,7 +7016,7 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-    const int64_t nrows0 = ggml_nrows(src0);
+    // const int64_t nrows0 = ggml_nrows(src0);
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
@@ -6944,7 +7117,7 @@ static void ggml_cuda_op_mul_mat(
         if (src0_on_device && src0_is_contiguous) {
             src0_dd[id] = (char *) src0_extra->data_device[id];
         } else {
-            const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+            // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
             src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
         }
 
@@ -7160,6 +7333,14 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }
 
+static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+}
+
+static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+}
+
 static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }
@@ -7169,7 +7350,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }
 
     const int64_t ne10 = src1->ne[0];
 
@@ -7247,7 +7428,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-__global__ void k_compute_batched_ptrs(
+__global__ static void k_compute_batched_ptrs(
     const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
     const void ** ptrs_src, void ** ptrs_dst,
     int ne12, int ne13,
@@ -7543,6 +7724,9 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
                               ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
+                              ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -7574,6 +7758,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }
 
+static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
+}
+
 static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
@@ -7685,11 +7873,11 @@ static size_t g_temp_tensor_extra_index = 0;
 
 static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     if (g_temp_tensor_extras == nullptr) {
-        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
     }
 
     size_t alloc_index = g_temp_tensor_extra_index;
-    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
     ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
     memset(extra, 0, sizeof(*extra));
 
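The pool is a fixed ring of GGML_CUDA_MAX_NODES (8192) slots, the backend-local constant added near the top of the file; indices wrap with modulo, so slots are recycled after a full cycle. The recycling logic in isolation (sketch):

```cuda
#include <cstddef>

#define GGML_CUDA_MAX_NODES 8192

static std::size_t g_index = 0;

// Fixed-size ring: after GGML_CUDA_MAX_NODES allocations the index wraps
// and the oldest slot is handed out again.
static std::size_t next_extra_slot() {
    const std::size_t alloc_index = g_index;
    g_index = (g_index + 1) % GGML_CUDA_MAX_NODES;
    return alloc_index;
}

int main() {
    for (int i = 0; i < GGML_CUDA_MAX_NODES; ++i) {
        next_extra_slot();
    }
    return next_extra_slot() == 0 ? 0 : 1;   // wrapped around to slot 0
}
```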
@@ -7856,7 +8044,7 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }
 
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -7867,6 +8055,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         return false;
     }
 
+    if (tensor->op == GGML_OP_MUL_MAT) {
+        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+#endif
+            return false;
+        }
+    }
+
     switch (tensor->op) {
         case GGML_OP_REPEAT:
             func = ggml_cuda_repeat;
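The fallback message is also why <cinttypes> was added in the very first hunk: PRId64 is the portable printf conversion for int64_t. Minimal usage:

```cuda
#include <cinttypes>
#include <cstdio>

int main() {
    const int64_t ne3 = 4;
    // "%" PRId64 expands to the correct conversion for int64_t on any platform.
    printf("src0->ne[3] = %" PRId64 "\n", ne3);
    return 0;
}
```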
@@ -7891,6 +8088,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_UNARY_OP_SILU:
             func = ggml_cuda_silu;
             break;
+        case GGML_UNARY_OP_RELU:
+            func = ggml_cuda_relu;
+            break;
         default:
             return false;
     } break;
@@ -7909,6 +8109,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_SCALE:
             func = ggml_cuda_scale;
             break;
+        case GGML_OP_SQR:
+            func = ggml_cuda_sqr;
+            break;
         case GGML_OP_CLAMP:
             if (!any_on_device) {
                 return false;
@@ -7939,6 +8142,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_ALIBI:
             func = ggml_cuda_alibi;
             break;
+        case GGML_OP_IM2COL:
+            func = ggml_cuda_im2col;
+            break;
         default:
             return false;
     }
@@ -7998,11 +8204,11 @@ struct ggml_backend_buffer_context_cuda {
 
     ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
         if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
        }
 
         size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
         ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
         memset(extra, 0, sizeof(*extra));
 
@@ -8088,7 +8294,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backe
     ggml_cuda_set_device(g_main_device);
 
     ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+
+    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+
+    ggml_cuda_set_device(g_main_device);
     CUDA_CHECK(cudaMalloc(&ctx->device, size));
+
     return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
 }
 
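The clamp works around a cudaMalloc corner case: a zero-byte request legally yields a null pointer, which CUDA_CHECK-style callers would treat as failure. In isolation:

```cuda
#include <algorithm>
#include <cstddef>

int main() {
    std::size_t size = 0;                        // e.g. a zero-element tensor's buffer
    size = std::max(size, (std::size_t)1);       // never hand cudaMalloc a size of 0
    return size == 1 ? 0 : 1;
}
```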
@@ -8132,14 +8343,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
     UNUSED(cgraph);
 }
 
-static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
     UNUSED(plan);
 }
 
-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
@@ -8155,6 +8366,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
+            continue;
+        }
         assert(node->backend == GGML_BACKEND_GPU);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {