llama_cpp 0.9.3 → 0.9.5
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/src/ggml-alloc.c +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +177 -98
- data/ext/llama_cpp/src/ggml-metal.m +29 -17
- data/ext/llama_cpp/src/ggml-metal.metal +93 -93
- data/ext/llama_cpp/src/ggml-opencl.cpp +5 -7
- data/ext/llama_cpp/src/ggml-quants.c +1 -1
- data/ext/llama_cpp/src/ggml.c +154 -30
- data/ext/llama_cpp/src/ggml.h +11 -3
- data/ext/llama_cpp/src/llama.cpp +316 -122
- data/ext/llama_cpp/src/llama.h +72 -4
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -3
data/ext/llama_cpp/src/ggml-cuda.cu:

```diff
@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <cinttypes>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -235,7 +236,7 @@ typedef float2 dfloat2;
 #endif //GGML_CUDA_F16
 
 static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
     int x32 = 0;
     x32 |= x16[0] << 0;
@@ -245,7 +246,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
 }
 
 static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
     int x32 = 0;
     x32 |= x16[0] << 0;
@@ -255,11 +256,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
 }
 
 static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
-    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
-    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 template<typename T>
```
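The four cast fixes above set the pattern for much of this release: a pointer derived from a `const` argument was being cast to a non-`const` pointee type, which silently discards the qualifier and trips `-Wcast-qual`. A minimal sketch of the fix in isolation (hypothetical helper name, not part of the diff):

```cpp
#include <cstdint>

// Reading an int through a const byte pointer: writing (const uint16_t *)
// instead of (uint16_t *) keeps the const qualifier through the cast and
// compiles cleanly under -Wcast-qual. Mirrors get_int_from_int8 above.
static int get_int_example(const int8_t * x8, int i32) {
    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32);

    int x32 = 0;
    x32 |= x16[0] << 0;
    x32 |= x16[1] << 16;
    return x32;
}
```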
```diff
@@ -442,6 +443,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_CLAMP_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
 #define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
@@ -469,7 +471,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
 #define MUL_MAT_SRC1_COL_STRIDE 128
 
 #define MAX_STREAMS 8
-static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
 
 struct ggml_tensor_extra_gpu {
     void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
```
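The `g_cudaStreams` change above (and the matching `sum[...] = {{0.0f}}` change in `mul_mat_q` further down) is purely about initializer braces: both forms zero-initialize the entire 2-D array, but only the fully braced form matches the nesting of the aggregate, so it compiles without `-Wmissing-braces`. A standalone illustration with hypothetical dimensions:

```cpp
// Both arrays end up all-nullptr; the nested braces mirror the 2-D shape.
static void * flat[4][8]   = { nullptr };     // -Wmissing-braces warns here
static void * nested[4][8] = { { nullptr } }; // braces match the nesting
```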
```diff
@@ -500,6 +502,31 @@ static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    }
+    return x;
+}
+
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+static __device__ __forceinline__ float warp_reduce_max(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+    }
+    return x;
+}
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
```
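These three helpers are mostly not new code: the two `warp_reduce_sum` variants previously sat next to their single call sites in `norm_f32` and `rms_norm_f32` (see the two removal hunks that follow) and are hoisted here, with `warp_reduce_max` added, so the new soft-max kernel later in the file can use them too. Each is a butterfly reduction: every `__shfl_xor_sync` exchanges values between lanes whose IDs differ in one bit, so after log2(32) = 5 rounds each lane of the warp holds the full sum or max. A sketch of single-warp usage (kernel name and launch shape are illustrative only; relies on the `warp_reduce_sum` defined above and compiles with nvcc):

```cpp
// Illustrative only: one warp cooperatively sums n floats with the
// warp_reduce_sum helper above; after the butterfly, every lane holds
// the total, so lane 0 can publish it.
__global__ void warp_sum_example(const float * x, float * out, int n) {
    float acc = 0.0f;
    for (int i = threadIdx.x; i < n; i += 32) { // stride by warp size
        acc += x[i];
    }
    acc = warp_reduce_sum(acc); // 5 __shfl_xor_sync rounds
    if (threadIdx.x == 0) {
        *out = acc;
    }
}
// launch: warp_sum_example<<<1, 32>>>(x, out, n);
```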
```diff
@@ -576,15 +603,6 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] * x[i];
 }
 
-static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
-    }
-    return a;
-}
-
 template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -623,14 +641,6 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     }
 }
 
-static __device__ __forceinline__ float warp_reduce_sum(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
-    }
-    return x;
-}
-
 template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -2248,6 +2258,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
```
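The `(void)x_qh; (void)x_sc;` statements introduced here, and repeated through the long run of MMQ hunks that follows, all do the same thing: the tile allocate/load/dot helpers share one signature across every quantization type, so the types that have no high-bit tile (`x_qh`) or scale tile (`x_sc`) explicitly discard those parameters to keep unused-parameter warnings quiet. The idiom in isolation (hypothetical helper):

```cpp
// One shared signature for all quant types; parameters a given type does
// not use are explicitly discarded instead of being removed.
static void allocate_tiles_example(int ** x_ql, int ** x_qh, int ** x_sc) {
    (void)x_qh; (void)x_sc; // unused for this quant type
    // ... set up *x_ql only ...
}
```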
```diff
@@ -2259,7 +2270,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-
+    (void)x_qh; (void)x_sc;
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
     GGML_CUDA_ASSUME(k >= 0);
@@ -2268,7 +2279,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_0;
     const int kqsx = k % QI4_0;
 
-    const block_q4_0 * bx0 = (block_q4_0 *) vx;
+    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
 
     float * x_dmf = (float *) x_dm;
 
@@ -2306,9 +2317,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const float * x_dmf = (float *) x_dm;
+    const float * x_dmf = (const float *) x_dm;
 
     int u[2*VDR_Q4_0_Q8_1_MMQ];
 
@@ -2342,6 +2354,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
@@ -2353,6 +2366,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2362,7 +2376,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_1;
     const int kqsx = k % QI4_1;
 
-    const block_q4_1 * bx0 = (block_q4_1 *) vx;
+    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2397,6 +2411,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 
@@ -2434,6 +2449,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
@@ -2445,6 +2461,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2454,7 +2471,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_0;
     const int kqsx = k % QI5_0;
 
-    const block_q5_0 * bx0 = (block_q5_0 *) vx;
+    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2509,6 +2526,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
@@ -2548,6 +2566,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
@@ -2559,6 +2578,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2568,7 +2588,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_1;
     const int kqsx = k % QI5_1;
 
-    const block_q5_1 * bx0 = (block_q5_1 *) vx;
+    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2620,6 +2640,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@@ -2654,6 +2675,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
@@ -2665,6 +2687,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2675,7 +2698,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kqsx = k % QI8_0;
     float * x_dmf = (float *) x_dm;
 
-    const block_q8_0 * bx0 = (block_q8_0 *) vx;
+    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2710,6 +2733,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
@@ -2743,6 +2767,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
@@ -2756,6 +2781,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2765,7 +2791,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI2_K;
     const int kqsx = k % QI2_K;
 
-    const block_q2_K * bx0 = (block_q2_K *) vx;
+    const block_q2_K * bx0 = (const block_q2_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2813,6 +2839,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const int kbx = k / QI2_K;
     const int ky = (k % QI2_K) * QR2_K;
@@ -2886,7 +2913,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI3_K;
     const int kqsx = k % QI3_K;
 
-    const block_q3_K * bx0 = (block_q3_K *) vx;
+    const block_q3_K * bx0 = (const block_q3_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2967,7 +2994,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
 
-    const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
 
     int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
 
@@ -3082,6 +3109,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@@ -3095,6 +3123,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3104,7 +3133,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_K; // == 0 if QK_K == 256
     const int kqsx = k % QI4_K; // == k if QK_K == 256
 
-    const block_q4_K * bx0 = (block_q4_K *) vx;
+    const block_q4_K * bx0 = (const block_q4_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3149,7 +3178,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
 
-        const int * scales = (int *) bxi->scales;
+        const int * scales = (const int *) bxi->scales;
 
         const int ksc = k % (WARP_SIZE/8);
 
@@ -3164,6 +3193,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
 
@@ -3263,6 +3293,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@@ -3276,6 +3307,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3285,7 +3317,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_K; // == 0 if QK_K == 256
     const int kqsx = k % QI5_K; // == k if QK_K == 256
 
-    const block_q5_K * bx0 = (block_q5_K *) vx;
+    const block_q5_K * bx0 = (const block_q5_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3341,7 +3373,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
 
-        const int * scales = (int *) bxi->scales;
+        const int * scales = (const int *) bxi->scales;
 
         const int ksc = k % (WARP_SIZE/8);
 
@@ -3356,6 +3388,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
 
@@ -3392,6 +3425,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@@ -3405,6 +3439,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3414,7 +3449,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI6_K; // == 0 if QK_K == 256
     const int kqsx = k % QI6_K; // == k if QK_K == 256
 
-    const block_q6_K * bx0 = (block_q6_K *) vx;
+    const block_q6_K * bx0 = (const block_q6_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3476,6 +3511,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
@@ -3518,7 +3554,7 @@ static __device__ __forceinline__ void mul_mat_q(
     __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
     __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
 
-    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
 
     for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
 
```
```diff
@@ -4583,8 +4619,8 @@ static __global__ void rope(
 
 template<typename T, bool has_pos>
 static __global__ void rope_neox(
-    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
 ) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
@@ -4593,23 +4629,25 @@ static __global__ void rope_neox(
     }
 
     const int row = blockDim.x*blockIdx.x + threadIdx.x;
-    const int i = row*ncols + col/2;
+    const int ib = col / n_dims;
+    const int ic = col % n_dims;
+
+    const int i = row*ncols + ib*n_dims + ic/2;
     const int i2 = row/p_delta_rows;
 
-
-    const float cur_rot = -float(col)/ncols;
+    float cur_rot = inv_ndims * ic - ib;
 
     const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*powf(
+    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
 
     float cos_theta, sin_theta;
     rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
     const float x0 = x[i + 0];
-    const float x1 = x[i + ncols/2];
+    const float x1 = x[i + n_dims/2];
 
-    dst[i + 0]       = x0*cos_theta - x1*sin_theta;
-    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
 }
 
 static __global__ void rope_glm_f32(
```
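The rewritten kernel splits the flat column index into a block index `ib = col / n_dims` and an in-block offset `ic = col % n_dims`, so each rotation pairs elements `n_dims/2` apart within their block rather than `ncols/2` apart across the whole row; this is what lets the `ne00 == n_dims` assertion in `ggml_cuda_op_rope` (removed further down) go away. The angle now comes from `theta_scale`, which the launcher derives from `freq_base`. A scalar sketch of the new indexing and angle, omitting the YaRN correction that `rope_yarn` applies in the real kernel:

```cpp
#include <math.h>

// Scalar sketch of the new NeoX rope for one (row, col) pair, assuming
// theta_scale = powf(freq_base, -2.0f/n_dims) as computed by
// rope_neox_cuda below. The rope_yarn adjustment is omitted here.
static void rope_neox_ref(const float * x, float * dst, int row, int col,
                          int ncols, int n_dims, int p, float freq_scale,
                          float theta_scale) {
    const int ib = col / n_dims;                 // which n_dims-wide block
    const int ic = col % n_dims;                 // offset inside that block
    const int i  = row*ncols + ib*n_dims + ic/2;

    const float theta = p*freq_scale*powf(theta_scale, col/2.0f);
    const float x0 = x[i + 0];
    const float x1 = x[i + n_dims/2];            // pair is n_dims/2 apart

    dst[i + 0]        = x0*cosf(theta) - x1*sinf(theta);
    dst[i + n_dims/2] = x0*sinf(theta) + x1*cosf(theta);
}
```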
```diff
@@ -4688,45 +4726,74 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
     dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
 }
 
-// the CUDA soft max implementation differs from the CPU implementation
-// instead of doubles floats are used
-static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-    const int block_size = blockDim.y;
-    const int tid = threadIdx.y;
+static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
+    const int tid  = threadIdx.x;
+    const int rowx = blockIdx.x;
+    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+
+    const int block_size = blockDim.x;
+
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+
+    __shared__ float buf[CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE];
 
     float max_val = -INFINITY;
 
     for (int col = tid; col < ncols; col += block_size) {
-        const int i = row*ncols + col;
-        max_val = max(max_val, x[i]);
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        max_val = max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f));
     }
 
     // find the max value in the block
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+    max_val = warp_reduce_max(max_val);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = -INFINITY;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf[warp_id] = max_val;
+        }
+        __syncthreads();
+
+        max_val = buf[lane_id];
+        max_val = warp_reduce_max(max_val);
     }
 
     float tmp = 0.f;
 
     for (int col = tid; col < ncols; col += block_size) {
-        const int i = row*ncols + col;
-        const float val = expf(x[i] - max_val);
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        const float val = expf((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val);
         tmp += val;
-        dst[i] = val;
+        dst[ix] = val;
     }
 
-    // sum up partial sums
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    // find the sum of exps in the block
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = 0.f;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf[warp_id] = tmp;
+        }
+        __syncthreads();
+
+        tmp = buf[lane_id];
+        tmp = warp_reduce_sum(tmp);
     }
 
     const float inv_tmp = 1.f / tmp;
 
     for (int col = tid; col < ncols; col += block_size) {
-        const int i = row*ncols + col;
+        const int i = rowx*ncols + col;
         dst[i] *= inv_tmp;
     }
 }
```
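The replacement kernel fuses scaling and masking into the softmax itself, computing `dst = softmax(x*scale + mask)` one row per block in three passes (running max, sum of exponentials, normalization). Each pass reduces within a warp first; when the block is wider than one warp, per-warp results are combined through the small `buf` array in shared memory, which is why `CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE` entries suffice. A plain reference of what one row computes, with the mask row already chosen via `rowy = rowx % nrows_y` as in the kernel:

```cpp
#include <math.h>

// Reference for one row of the fused soft max: dst = softmax(x*scale + y),
// where y is the optional mask row (pass NULL for no mask).
static void soft_max_row_ref(const float * x, const float * y, float * dst,
                             int ncols, float scale) {
    float max_val = -INFINITY;
    for (int c = 0; c < ncols; c++) {
        max_val = fmaxf(max_val, x[c]*scale + (y ? y[c] : 0.0f));
    }
    float sum = 0.0f;
    for (int c = 0; c < ncols; c++) {
        dst[c] = expf((x[c]*scale + (y ? y[c] : 0.0f)) - max_val);
        sum += dst[c];
    }
    const float inv_sum = 1.0f / sum;
    for (int c = 0; c < ncols; c++) {
        dst[c] *= inv_sum; // same normalization the kernel applies
    }
}
```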
```diff
@@ -5712,20 +5779,26 @@ static void rope_cuda(
 
 template<typename T>
 static void rope_neox_cuda(
-    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
     float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
 ) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float inv_ndims = -1.0f / n_dims;
+
     if (pos == nullptr) {
         rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims
+            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, inv_ndims
         );
     } else {
         rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims
+            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, inv_ndims
         );
     }
 }
@@ -5757,10 +5830,12 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
     diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 }
 
-static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
-    const dim3 block_dims(1, WARP_SIZE, 1);
+static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
+    int nth = WARP_SIZE;
+    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+    const dim3 block_dims(nth, 1, 1);
     const dim3 block_nums(nrows_x, 1, 1);
-    soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
+    soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
 }
 
 static void im2col_f32_f16_cuda(const float * x, half * dst,
```
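The launcher now sizes its block to the smallest power of two that covers the row, with a floor of one warp and a ceiling of `CUDA_SOFT_MAX_BLOCK_SIZE`: a 20-column row runs with 32 threads, a 300-column row with 512, and rows of 1024 columns or more saturate the cap and loop inside the kernel. The sizing rule on its own:

```cpp
// Block sizing used by soft_max_f32_cuda: next power of two >= ncols_x,
// clamped to [WARP_SIZE, CUDA_SOFT_MAX_BLOCK_SIZE] = [32, 1024].
static int soft_max_block_size(int ncols_x) {
    int nth = 32;
    while (nth < ncols_x && nth < 1024) {
        nth *= 2;
    }
    return nth; // e.g. 20 -> 32, 300 -> 512, 5000 -> 1024
}
```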
```diff
@@ -6023,18 +6098,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
         return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
-    } else if (nb0 == ts) {
+    }
+    if (nb0 == ts) {
         return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
-    } else {
-        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
-            const void * rx = (const void *) ((const char *) x + i1*nb1);
-            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
-            // pretend the row is a matrix with cols=1
-            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
-            if (r != cudaSuccess) return r;
-        }
-        return cudaSuccess;
     }
+    for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+        const void * rx = (const void *) ((const char *) x + i1*nb1);
+        void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+        // pretend the row is a matrix with cols=1
+        cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+        if (r != cudaSuccess) { return r; }
+    }
+    return cudaSuccess;
 }
 
 static void ggml_cuda_op_repeat(
@@ -6680,15 +6755,14 @@ inline void ggml_cuda_op_rope(
         GGML_ASSERT(false);
         rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
     } else if (is_neox) {
-        GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
         if (src0->type == GGML_TYPE_F32) {
             rope_neox_cuda(
-                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, main_stream
             );
         } else if (src0->type == GGML_TYPE_F16) {
             rope_neox_cuda(
-                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, main_stream
             );
         } else {
```
```diff
@@ -6812,14 +6886,18 @@ inline void ggml_cuda_op_soft_max(
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
+    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+
     const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
+    const int64_t nrows_x = ggml_nrows(src0);
+    const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
 
-    soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
+    float scale = 1.0f;
+    memcpy(&scale, dst->op_params, sizeof(float));
+
+    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
 
-    (void) src1;
     (void) dst;
-    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_scale(
```
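The scale factor reaches the op through the tensor's `op_params` blob, so it is recovered with `memcpy` rather than a pointer cast; `op_params` is an array of `int32_t`, and `memcpy` avoids the strict-aliasing and alignment questions that `*(const float *) dst->op_params` would raise. The retrieval pattern in isolation:

```cpp
#include <cstdint>
#include <cstring>

// Reading a float packed into an int32 op_params array, as the soft max
// op above does; memcpy is well-defined where a type-punning cast is not.
static float read_op_param_f32(const int32_t * op_params) {
    float scale = 1.0f;
    memcpy(&scale, op_params, sizeof(float));
    return scale;
}
```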
```diff
@@ -6989,7 +7067,7 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-    const int64_t nrows0 = ggml_nrows(src0);
+    // const int64_t nrows0 = ggml_nrows(src0);
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
@@ -7090,7 +7168,7 @@
         if (src0_on_device && src0_is_contiguous) {
             src0_dd[id] = (char *) src0_extra->data_device[id];
         } else {
-            const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+            // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
             src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
         }
 
@@ -7323,7 +7401,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }
 
     const int64_t ne10 = src1->ne[0];
 
@@ -7401,7 +7479,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-__global__ void k_compute_batched_ptrs(
+__global__ static void k_compute_batched_ptrs(
     const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
     const void ** ptrs_src, void ** ptrs_dst,
     int ne12, int ne13,
```
```diff
@@ -8017,7 +8095,7 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }
 
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8031,7 +8109,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     if (tensor->op == GGML_OP_MUL_MAT) {
         if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] =
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
 #endif
             return false;
         }
```
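This diagnostic is the consumer of the `<cinttypes>` include added in the very first hunk: `PRId64` expands to the correct `printf` length modifier for `int64_t` on every platform, and it must follow a `%` inside the adjacent string literal (the `%` is easy to lose when reading the concatenated pieces). Minimal usage:

```cpp
#include <cinttypes>
#include <cstdio>

int main() {
    const int64_t ne3 = 4;
    // String-literal concatenation: "%" PRId64 "\n" forms a portable
    // conversion spec for int64_t ("%ld" or "%lld" depending on platform).
    printf("src0->ne[3] = %" PRId64 "\n", ne3);
    return 0;
}
```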
```diff
@@ -8316,14 +8394,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
     UNUSED(cgraph);
 }
 
-static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
     UNUSED(plan);
 }
 
-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
@@ -8339,8 +8417,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
             continue;
+        }
         assert(node->backend == GGML_BACKEND_GPU);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
```