llama_cpp 0.9.2 → 0.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +378 -208
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +260 -46
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +113 -32
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +174 -74
- data/ext/llama_cpp/src/ggml.c +881 -1459
- data/ext/llama_cpp/src/ggml.h +64 -45
- data/ext/llama_cpp/src/llama.cpp +555 -49
- data/ext/llama_cpp/src/llama.h +77 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml-cuda.cu:

@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <cinttypes>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -81,12 +82,15 @@
 
 #include "ggml-cuda.h"
 #include "ggml.h"
+#include "ggml-backend-impl.h"
 
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA 700
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 
+#define GGML_CUDA_MAX_NODES 8192
+
 // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
 // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
 // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
@@ -232,7 +236,7 @@ typedef float2 dfloat2;
 #endif //GGML_CUDA_F16
 
 static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
     int x32 = 0;
     x32 |= x16[0] << 0;
@@ -242,7 +246,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
 }
 
 static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
     int x32 = 0;
     x32 |= x16[0] << 0;
@@ -252,11 +256,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
 }
 
 static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
-    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
-    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 template<typename T>
@@ -433,6 +437,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_GELU_BLOCK_SIZE 256
 #define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_RELU_BLOCK_SIZE 256
+#define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_CLAMP_BLOCK_SIZE 256
@@ -464,7 +470,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
 #define MUL_MAT_SRC1_COL_STRIDE 128
 
 #define MAX_STREAMS 8
-static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
 
 struct ggml_tensor_extra_gpu {
     void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
@@ -553,6 +559,24 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
+static __global__ void relu_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = fmaxf(x[i], 0);
+}
+
+static __global__ void sqr_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * x[i];
+}
+
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
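The relu_f32 and sqr_f32 kernels added above follow this file's usual elementwise pattern: one thread per element, an early-out bounds check, and a 1-D grid sized by rounding the element count up to the block size (the relu_f32_cuda and sqr_f32_cuda wrappers later in this diff do exactly that with CUDA_RELU_BLOCK_SIZE and CUDA_SQR_BLOCK_SIZE). The following is a rough, self-contained sketch of that launch pattern; the array size, block size, and the demo kernel name are made up for the example and are not taken from the gem:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Same shape as the kernels added in this diff: one thread per element plus a bounds guard.
static __global__ void relu_f32_demo(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = fmaxf(x[i], 0);
}

int main() {
    const int k = 1000;          // arbitrary element count for the demo
    const int block_size = 256;  // mirrors CUDA_RELU_BLOCK_SIZE above

    float h[k];
    for (int i = 0; i < k; ++i) {
        h[i] = (i % 2 ? 1.0f : -1.0f) * i;
    }

    float * dx = nullptr;
    float * dy = nullptr;
    cudaMalloc((void **) &dx, k*sizeof(float));
    cudaMalloc((void **) &dy, k*sizeof(float));
    cudaMemcpy(dx, h, k*sizeof(float), cudaMemcpyHostToDevice);

    // round the grid up so every element is covered, exactly like relu_f32_cuda further down
    const int num_blocks = (k + block_size - 1) / block_size;
    relu_f32_demo<<<num_blocks, block_size>>>(dx, dy, k);

    cudaMemcpy(h, dy, k*sizeof(float), cudaMemcpyDeviceToHost);
    printf("h[1] = %f, h[2] = %f\n", h[1], h[2]);  // 1.000000, 0.000000
    cudaFree(dx);
    cudaFree(dy);
    return 0;
}
```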
@@ -2225,6 +2249,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
@@ -2236,7 +2261,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-
+    (void)x_qh; (void)x_sc;
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
     GGML_CUDA_ASSUME(k >= 0);
@@ -2245,7 +2270,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_0;
     const int kqsx = k % QI4_0;
 
-    const block_q4_0 * bx0 = (block_q4_0 *) vx;
+    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
 
     float * x_dmf = (float *) x_dm;
 
@@ -2283,9 +2308,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const float * x_dmf = (float *) x_dm;
+    const float * x_dmf = (const float *) x_dm;
 
     int u[2*VDR_Q4_0_Q8_1_MMQ];
 
@@ -2319,6 +2345,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
@@ -2330,6 +2357,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2339,7 +2367,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_1;
     const int kqsx = k % QI4_1;
 
-    const block_q4_1 * bx0 = (block_q4_1 *) vx;
+    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
 
#pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2374,6 +2402,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 
@@ -2411,6 +2440,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
@@ -2422,6 +2452,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2431,7 +2462,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_0;
     const int kqsx = k % QI5_0;
 
-    const block_q5_0 * bx0 = (block_q5_0 *) vx;
+    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
 
#pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2486,6 +2517,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
@@ -2525,6 +2557,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
@@ -2536,6 +2569,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2545,7 +2579,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_1;
     const int kqsx = k % QI5_1;
 
-    const block_q5_1 * bx0 = (block_q5_1 *) vx;
+    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
 
#pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2597,6 +2631,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@@ -2631,6 +2666,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
@@ -2642,6 +2678,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2652,7 +2689,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kqsx = k % QI8_0;
     float * x_dmf = (float *) x_dm;
 
-    const block_q8_0 * bx0 = (block_q8_0 *) vx;
+    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
 
#pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2687,6 +2724,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
@@ -2720,6 +2758,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
@@ -2733,6 +2772,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2742,7 +2782,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI2_K;
     const int kqsx = k % QI2_K;
 
-    const block_q2_K * bx0 = (block_q2_K *) vx;
+    const block_q2_K * bx0 = (const block_q2_K *) vx;
 
#pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2790,6 +2830,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const int kbx = k / QI2_K;
     const int ky = (k % QI2_K) * QR2_K;
@@ -2863,7 +2904,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI3_K;
     const int kqsx = k % QI3_K;
 
-    const block_q3_K * bx0 = (block_q3_K *) vx;
+    const block_q3_K * bx0 = (const block_q3_K *) vx;
 
#pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2944,7 +2985,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
 
-    const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
 
     int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
 
@@ -3059,6 +3100,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@@ -3072,6 +3114,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3081,7 +3124,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_K; // == 0 if QK_K == 256
     const int kqsx = k % QI4_K; // == k if QK_K == 256
 
-    const block_q4_K * bx0 = (block_q4_K *) vx;
+    const block_q4_K * bx0 = (const block_q4_K *) vx;
 
#pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3126,7 +3169,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
 
-        const int * scales = (int *) bxi->scales;
+        const int * scales = (const int *) bxi->scales;
 
         const int ksc = k % (WARP_SIZE/8);
 
@@ -3141,6 +3184,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
 
@@ -3240,6 +3284,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@@ -3253,6 +3298,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3262,7 +3308,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_K; // == 0 if QK_K == 256
     const int kqsx = k % QI5_K; // == k if QK_K == 256
 
-    const block_q5_K * bx0 = (block_q5_K *) vx;
+    const block_q5_K * bx0 = (const block_q5_K *) vx;
 
#pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3318,7 +3364,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
 
-        const int * scales = (int *) bxi->scales;
+        const int * scales = (const int *) bxi->scales;
 
         const int ksc = k % (WARP_SIZE/8);
 
@@ -3333,6 +3379,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
 
@@ -3369,6 +3416,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@@ -3382,6 +3430,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3391,7 +3440,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI6_K; // == 0 if QK_K == 256
     const int kqsx = k % QI6_K; // == k if QK_K == 256
 
-    const block_q6_K * bx0 = (block_q6_K *) vx;
+    const block_q6_K * bx0 = (const block_q6_K *) vx;
 
#pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3453,6 +3502,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
@@ -3495,7 +3545,7 @@ static __device__ __forceinline__ void mul_mat_q(
     __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
     __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
 
-    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
 
     for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
 
@@ -4468,6 +4518,13 @@ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
     *dsti = __float2half(*xi);
 }
 
+static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+    const half * xi = (const half *) cxi;
+    half * dsti = (half *) cdsti;
+
+    *dsti = *xi;
+}
+
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -4721,6 +4778,25 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
     dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
 }
 
+static __global__ void im2col_f32_f16(
+    const float * x, half * dst,
+    int ofs0, int ofs1, int IW, int IH, int CHW,
+    int s0, int s1, int p0, int p1, int d0, int d1) {
+    const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
+    const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
+
+    const int offset_dst =
+        (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
+        (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        dst[offset_dst] = __float2half(0.0f);
+    } else {
+        const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
+        dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
+    }
+}
+
 template<int qk, int qr, dequantize_kernel_t dq>
 static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
     const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
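The im2col_f32_f16 kernel above writes one element of an [N, OH*OW, IC*KH*KW] matrix per thread: with the launch geometry used later in this diff (grid = (IC, OH, OW), block = (N, KH, KW)), blockIdx.y/z select the output pixel, threadIdx.y/z select the kernel tap, and iiw/iih map that pair back to an input pixel via stride (s0, s1), dilation (d0, d1) and padding (p0, p1), writing zero for out-of-range taps. The output size itself is chosen by the caller with the usual convolution arithmetic, which this diff does not spell out; the sketch below states that formula under that assumption, with made-up example numbers:

```cuda
// Standard convolution output-size arithmetic (assumption: this is how the caller
// sizes dst; the formula itself does not appear in this diff).
// in: input extent, k: kernel extent, s: stride, p: padding, d: dilation.
static int conv_out_size(int in, int k, int s, int p, int d) {
    return (in + 2*p - d*(k - 1) - 1) / s + 1;
}

// Example: IW = 28, KW = 3, s0 = 1, p0 = 1, d0 = 1  ->  OW = (28 + 2 - 2 - 1)/1 + 1 = 28
```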
@@ -4759,6 +4835,16 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
     silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     if (ncols < 1024) {
@@ -5611,6 +5697,16 @@ static void ggml_cpy_f32_f16_cuda(
         (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
 }
 
+static void ggml_cpy_f16_f16_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+}
+
 static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
@@ -5694,6 +5790,15 @@ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, c
     soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
 }
 
+static void im2col_f32_f16_cuda(const float * x, half * dst,
+    int OH, int IW, int IH, int OW, int IC,
+    int KH, int KW, int N, int ofs0, int ofs1,
+    int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
+    dim3 block_nums(IC, OH, OW);
+    dim3 block_dims(N, KH, KW);
+    im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+}
+
 // buffer pool for cuda
 #define MAX_CUDA_BUFFERS 256
 
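im2col_f32_f16_cuda maps the work onto the launch geometry directly: one block per (channel, output row, output column) and one thread per (batch element, kernel row, kernel column), so the total thread count equals the number of elements in the im2col result. A practical consequence of this choice is that N*KH*KW has to fit within CUDA's per-block thread limit (1024 on current devices). A quick back-of-the-envelope check, with shapes invented for the example rather than taken from the gem:

```cuda
#include <cstdio>

int main() {
    // Illustrative values only; the real ones come from the ggml tensors.
    const int N = 1, IC = 64, KH = 3, KW = 3;  // batch, channels, kernel size
    const int OH = 32, OW = 32;                // output spatial size

    const long long blocks_per_grid   = 1LL * IC * OH * OW;  // grid  = (IC, OH, OW)
    const long long threads_per_block = 1LL * N  * KH * KW;  // block = (N, KH, KW), must be <= 1024

    // one thread per element of the [N, OH*OW, IC*KH*KW] im2col matrix
    printf("threads per block: %lld, dst elements: %lld\n",
           threads_per_block, blocks_per_grid * threads_per_block);  // 9, 589824
    return 0;
}
```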
@@ -5762,7 +5867,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         return ptr;
     }
#ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u
+    fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
             (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
#endif
     void * ptr;
@@ -5900,7 +6005,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // The allocation error can be bypassed. A null ptr will assigned out of this function.
         // This can fixed the OOM error in WSL.
         cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f
+        fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -5945,18 +6050,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
         return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
-    }
+    }
+    if (nb0 == ts) {
         return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
-    } else {
-        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
-            const void * rx = (const void *) ((const char *) x + i1*nb1);
-            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
-            // pretend the row is a matrix with cols=1
-            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
-            if (r != cudaSuccess) return r;
-        }
-        return cudaSuccess;
     }
+    for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+        const void * rx = (const void *) ((const char *) x + i1*nb1);
+        void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+        // pretend the row is a matrix with cols=1
+        cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+        if (r != cudaSuccess) { return r; }
+    }
+    return cudaSuccess;
 }
 
 static void ggml_cuda_op_repeat(
@@ -6128,6 +6233,34 @@ inline void ggml_cuda_op_silu(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_relu(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_sqr(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 inline void ggml_cuda_op_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6250,6 +6383,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return max_compute_capability >= CC_RDNA2 ? 128 : 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
             return 1;
         case GGML_TYPE_Q2_K:
             return max_compute_capability >= CC_RDNA2 ? 128 : 32;
@@ -6272,6 +6406,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
             return 1;
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:
@@ -6463,8 +6598,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
         src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
         to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
     }
-    const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *)
-
+    const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
     size_t dst_as = 0;
     half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
 
@@ -6639,6 +6773,45 @@ inline void ggml_cuda_op_alibi(
     (void) src1_dd;
 }
 
+inline void ggml_cuda_op_im2col(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+    const int64_t N = src1->ne[is_2D ? 3 : 2];
+    const int64_t IC = src1->ne[is_2D ? 2 : 1];
+    const int64_t IH = is_2D ? src1->ne[1] : 1;
+    const int64_t IW = src1->ne[0];
+
+    const int64_t KH = is_2D ? src0->ne[1] : 1;
+    const int64_t KW = src0->ne[0];
+
+    const int64_t OH = is_2D ? dst->ne[2] : 1;
+    const int64_t OW = dst->ne[1];
+
+    const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
+    const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+
+    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
+        OH, IW, IH, OW, IC, KH, KW, N,
+        ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
+
+    (void) src0;
+    (void) src0_dd;
+}
+
 inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6843,7 +7016,7 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-    const int64_t nrows0 = ggml_nrows(src0);
+    // const int64_t nrows0 = ggml_nrows(src0);
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
@@ -6944,7 +7117,7 @@ static void ggml_cuda_op_mul_mat(
         if (src0_on_device && src0_is_contiguous) {
             src0_dd[id] = (char *) src0_extra->data_device[id];
         } else {
-            const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+            // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
             src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
         }
 
@@ -7160,6 +7333,14 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }
 
+static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+}
+
+static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+}
+
 static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }
@@ -7169,7 +7350,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }
 
     const int64_t ne10 = src1->ne[0];
 
@@ -7247,7 +7428,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-__global__ void k_compute_batched_ptrs(
+__global__ static void k_compute_batched_ptrs(
     const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
     const void ** ptrs_src, void ** ptrs_dst,
     int ne12, int ne13,
@@ -7543,6 +7724,9 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
             ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
+            ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
             ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -7574,6 +7758,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }
 
+static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
+}
+
 static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
@@ -7685,11 +7873,11 @@ static size_t g_temp_tensor_extra_index = 0;
 
 static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     if (g_temp_tensor_extras == nullptr) {
-        g_temp_tensor_extras = new ggml_tensor_extra_gpu[
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
     }
 
     size_t alloc_index = g_temp_tensor_extra_index;
-    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) %
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
     ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
     memset(extra, 0, sizeof(*extra));
 
@@ -7856,7 +8044,7 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }
 
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -7867,6 +8055,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         return false;
     }
 
+    if (tensor->op == GGML_OP_MUL_MAT) {
+        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+#endif
+            return false;
+        }
+    }
+
     switch (tensor->op) {
         case GGML_OP_REPEAT:
             func = ggml_cuda_repeat;
@@ -7891,6 +8088,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
                 case GGML_UNARY_OP_SILU:
                     func = ggml_cuda_silu;
                     break;
+                case GGML_UNARY_OP_RELU:
+                    func = ggml_cuda_relu;
+                    break;
                 default:
                     return false;
             } break;
@@ -7909,6 +8109,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_SCALE:
            func = ggml_cuda_scale;
            break;
+        case GGML_OP_SQR:
+            func = ggml_cuda_sqr;
+            break;
         case GGML_OP_CLAMP:
             if (!any_on_device) {
                 return false;
@@ -7939,6 +8142,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_ALIBI:
             func = ggml_cuda_alibi;
            break;
+        case GGML_OP_IM2COL:
+            func = ggml_cuda_im2col;
+            break;
         default:
             return false;
     }
@@ -7998,11 +8204,11 @@ struct ggml_backend_buffer_context_cuda {
 
     ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
         if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
        }
 
         size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) %
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
         ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
         memset(extra, 0, sizeof(*extra));
 
@@ -8088,7 +8294,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backe
     ggml_cuda_set_device(g_main_device);
 
     ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+
+    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+
+    ggml_cuda_set_device(g_main_device);
     CUDA_CHECK(cudaMalloc(&ctx->device, size));
+
     return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
 }
 
@@ -8132,14 +8343,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
     UNUSED(cgraph);
 }
 
-static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
     UNUSED(plan);
 }
 
-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
@@ -8155,6 +8366,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
+            continue;
+        }
         assert(node->backend == GGML_BACKEND_GPU);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {