llama_cpp 0.9.3 → 0.9.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/src/ggml-alloc.c +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +177 -98
- data/ext/llama_cpp/src/ggml-metal.m +29 -17
- data/ext/llama_cpp/src/ggml-metal.metal +93 -93
- data/ext/llama_cpp/src/ggml-opencl.cpp +5 -7
- data/ext/llama_cpp/src/ggml-quants.c +1 -1
- data/ext/llama_cpp/src/ggml.c +154 -30
- data/ext/llama_cpp/src/ggml.h +11 -3
- data/ext/llama_cpp/src/llama.cpp +316 -122
- data/ext/llama_cpp/src/llama.h +72 -4
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -3
--- data/ext/llama_cpp/src/ggml-cuda.cu (0.9.3)
+++ data/ext/llama_cpp/src/ggml-cuda.cu (0.9.5)
@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <cinttypes>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -235,7 +236,7 @@ typedef float2 dfloat2;
 #endif //GGML_CUDA_F16
 
 static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
     int x32 = 0;
     x32 |= x16[0] << 0;
@@ -245,7 +246,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
 }
 
 static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
     int x32 = 0;
     x32 |= x16[0] << 0;
@@ -255,11 +256,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
 }
 
 static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
-    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
-    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 template<typename T>
@@ -442,6 +443,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_CLAMP_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
 #define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
@@ -469,7 +471,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
 #define MUL_MAT_SRC1_COL_STRIDE 128
 
 #define MAX_STREAMS 8
-static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
 
 struct ggml_tensor_extra_gpu {
     void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
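Note: the extra inner braces added here (and the matching change from {0.0f} to {{0.0f}} for the sum accumulator in mul_mat_q further down) only satisfy -Wmissing-braces, which warns when a multi-dimensional aggregate is initialized with a single brace level. A minimal sketch with hypothetical names, not taken from the diff:

    // One brace level per array dimension; both arrays end up zero-initialized.
    static float g_vals_warns[4][8] = { 0.0f };      // may trigger -Wmissing-braces
    static float g_vals_clean[4][8] = { { 0.0f } };  // fully braced equivalent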
@@ -500,6 +502,31 @@ static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    }
+    return x;
+}
+
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+static __device__ __forceinline__ float warp_reduce_max(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+    }
+    return x;
+}
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
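The three warp_reduce_* helpers introduced above are shared by the kernels below (norm, rms_norm, and the new soft_max_f32): each XOR-shuffles its running value across lane masks 16, 8, 4, 2, 1, so after five steps every lane of the 32-thread warp holds the complete sum or maximum. A minimal usage sketch with a hypothetical kernel name that is not part of this diff:

    // One warp per row: each lane accumulates a strided partial sum, then the
    // butterfly shuffle combines the 32 partials so lane 0 can write the row total.
    __global__ void row_sum_f32(const float * x, float * dst, const int ncols) {
        const int row  = blockIdx.x;
        const int lane = threadIdx.x; // launch with blockDim.x == 32

        float sum = 0.0f;
        for (int col = lane; col < ncols; col += 32) {
            sum += x[row*ncols + col];
        }

    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
        }

        if (lane == 0) {
            dst[row] = sum;
        }
    }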
@@ -576,15 +603,6 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] * x[i];
 }
 
-static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
-    }
-    return a;
-}
-
 template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -623,14 +641,6 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     }
 }
 
-static __device__ __forceinline__ float warp_reduce_sum(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
-    }
-    return x;
-}
-
 template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -2248,6 +2258,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
@@ -2259,7 +2270,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-
+    (void)x_qh; (void)x_sc;
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
     GGML_CUDA_ASSUME(k >= 0);
@@ -2268,7 +2279,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_0;
     const int kqsx = k % QI4_0;
 
-    const block_q4_0 * bx0 = (block_q4_0 *) vx;
+    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
 
     float * x_dmf = (float *) x_dm;
 
@@ -2306,9 +2317,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const float * x_dmf = (float *) x_dm;
+    const float * x_dmf = (const float *) x_dm;
 
     int u[2*VDR_Q4_0_Q8_1_MMQ];
 
@@ -2342,6 +2354,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
@@ -2353,6 +2366,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2362,7 +2376,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_1;
     const int kqsx = k % QI4_1;
 
-    const block_q4_1 * bx0 = (block_q4_1 *) vx;
+    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2397,6 +2411,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 
@@ -2434,6 +2449,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
@@ -2445,6 +2461,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2454,7 +2471,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_0;
     const int kqsx = k % QI5_0;
 
-    const block_q5_0 * bx0 = (block_q5_0 *) vx;
+    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2509,6 +2526,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
@@ -2548,6 +2566,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
@@ -2559,6 +2578,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2568,7 +2588,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_1;
     const int kqsx = k % QI5_1;
 
-    const block_q5_1 * bx0 = (block_q5_1 *) vx;
+    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2620,6 +2640,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@@ -2654,6 +2675,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
@@ -2665,6 +2687,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2675,7 +2698,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kqsx = k % QI8_0;
     float * x_dmf = (float *) x_dm;
 
-    const block_q8_0 * bx0 = (block_q8_0 *) vx;
+    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2710,6 +2733,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
@@ -2743,6 +2767,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
@@ -2756,6 +2781,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2765,7 +2791,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI2_K;
     const int kqsx = k % QI2_K;
 
-    const block_q2_K * bx0 = (block_q2_K *) vx;
+    const block_q2_K * bx0 = (const block_q2_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2813,6 +2839,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const int kbx = k / QI2_K;
     const int ky = (k % QI2_K) * QR2_K;
@@ -2886,7 +2913,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI3_K;
     const int kqsx = k % QI3_K;
 
-    const block_q3_K * bx0 = (block_q3_K *) vx;
+    const block_q3_K * bx0 = (const block_q3_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2967,7 +2994,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
 
-    const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
 
     int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
 
@@ -3082,6 +3109,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@@ -3095,6 +3123,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3104,7 +3133,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_K; // == 0 if QK_K == 256
     const int kqsx = k % QI4_K; // == k if QK_K == 256
 
-    const block_q4_K * bx0 = (block_q4_K *) vx;
+    const block_q4_K * bx0 = (const block_q4_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3149,7 +3178,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
 
-        const int * scales = (int *) bxi->scales;
+        const int * scales = (const int *) bxi->scales;
 
         const int ksc = k % (WARP_SIZE/8);
 
@@ -3164,6 +3193,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
 
@@ -3263,6 +3293,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@@ -3276,6 +3307,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3285,7 +3317,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_K; // == 0 if QK_K == 256
     const int kqsx = k % QI5_K; // == k if QK_K == 256
 
-    const block_q5_K * bx0 = (block_q5_K *) vx;
+    const block_q5_K * bx0 = (const block_q5_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3341,7 +3373,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
 
-        const int * scales = (int *) bxi->scales;
+        const int * scales = (const int *) bxi->scales;
 
         const int ksc = k % (WARP_SIZE/8);
 
@@ -3356,6 +3388,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
 
@@ -3392,6 +3425,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@@ -3405,6 +3439,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3414,7 +3449,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI6_K; // == 0 if QK_K == 256
     const int kqsx = k % QI6_K; // == k if QK_K == 256
 
-    const block_q6_K * bx0 = (block_q6_K *) vx;
+    const block_q6_K * bx0 = (const block_q6_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3476,6 +3511,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
@@ -3518,7 +3554,7 @@ static __device__ __forceinline__ void mul_mat_q(
     __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
     __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
 
-    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
 
     for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
 
@@ -4583,8 +4619,8 @@ static __global__ void rope(
 
 template<typename T, bool has_pos>
 static __global__ void rope_neox(
-    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
 ) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
 
@@ -4593,23 +4629,25 @@ static __global__ void rope_neox(
     }
 
     const int row = blockDim.x*blockIdx.x + threadIdx.x;
-    const int
+    const int ib = col / n_dims;
+    const int ic = col % n_dims;
+
+    const int i = row*ncols + ib*n_dims + ic/2;
     const int i2 = row/p_delta_rows;
 
-
-    const float cur_rot = -float(col)/ncols;
+    float cur_rot = inv_ndims * ic - ib;
 
     const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*powf(
+    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
 
     float cos_theta, sin_theta;
     rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
     const float x0 = x[i + 0];
-    const float x1 = x[i +
+    const float x1 = x[i + n_dims/2];
 
-    dst[i + 0]
-    dst[i +
+    dst[i + 0] = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
 }
 
 static __global__ void rope_glm_f32(
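The rewritten rope_neox applies NeoX-style RoPE per n_dims-sized block of a row instead of assuming n_dims == ncols; the launcher in a later hunk precomputes theta_scale = freq_base^(-2/n_dims) and inv_ndims = -1/n_dims and passes them in. A host-side sketch (hypothetical helper, ignoring the YaRN corrections that rope_yarn applies on top) of the rotation applied to one element pair in the common single-block case:

    #include <math.h>

    // Rotates the pair stored at offsets ic and ic + n_dims/2 of a row, for token
    // position p, mirroring the angle the updated kernel derives when ib == 0.
    static void rope_neox_pair_ref(float x0, float x1, int p, int ic, int n_dims,
                                   float freq_base, float freq_scale,
                                   float * out0, float * out1) {
        const float theta_scale = powf(freq_base, -2.0f/n_dims);           // per-pair frequency decay
        const float theta       = p*freq_scale*powf(theta_scale, ic/2.0f); // matches theta_base for ib == 0

        const float cos_theta = cosf(theta);
        const float sin_theta = sinf(theta);

        *out0 = x0*cos_theta - x1*sin_theta;
        *out1 = x0*sin_theta + x1*cos_theta;
    }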
@@ -4688,45 +4726,74 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
     dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
 }
 
-
-
-
-    const int
-
-    const int
+static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
+    const int tid = threadIdx.x;
+    const int rowx = blockIdx.x;
+    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+
+    const int block_size = blockDim.x;
+
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+
+    __shared__ float buf[CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE];
 
     float max_val = -INFINITY;
 
     for (int col = tid; col < ncols; col += block_size) {
-        const int
-
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        max_val = max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f));
     }
 
     // find the max value in the block
-
-
-
+    max_val = warp_reduce_max(max_val);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = -INFINITY;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf[warp_id] = max_val;
+        }
+        __syncthreads();
+
+        max_val = buf[lane_id];
+        max_val = warp_reduce_max(max_val);
     }
 
     float tmp = 0.f;
 
     for (int col = tid; col < ncols; col += block_size) {
-        const int
-        const
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        const float val = expf((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val);
         tmp += val;
-        dst[
+        dst[ix] = val;
     }
 
-    // sum
-
-
-
+    // find the sum of exps in the block
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = 0.f;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf[warp_id] = tmp;
+        }
+        __syncthreads();
+
+        tmp = buf[lane_id];
+        tmp = warp_reduce_sum(tmp);
    }
 
     const float inv_tmp = 1.f / tmp;
 
     for (int col = tid; col < ncols; col += block_size) {
-        const int i =
+        const int i = rowx*ncols + col;
         dst[i] *= inv_tmp;
     }
 }
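The new soft_max_f32 fuses the attention scale and the optional mask into the softmax: for each row it computes softmax(x*scale + y), broadcasting the mask rows via rowy = rowx % nrows_y, and it reduces first inside each warp and then across warps through the shared buf[] array (hence CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE floats of shared memory). A single-row reference sketch (hypothetical host-side helper, not part of the diff) of the value each output element ends up holding:

    #include <math.h>

    // dst[c] = exp(x[c]*scale + mask[c] - max) / sum; pass mask == NULL for the unmasked case.
    static void soft_max_row_ref(const float * x, const float * mask, float * dst,
                                 int ncols, float scale) {
        float max_val = -INFINITY;
        for (int c = 0; c < ncols; ++c) {
            const float v = x[c]*scale + (mask ? mask[c] : 0.0f);
            max_val = v > max_val ? v : max_val;
        }

        float sum = 0.0f;
        for (int c = 0; c < ncols; ++c) {
            dst[c] = expf(x[c]*scale + (mask ? mask[c] : 0.0f) - max_val);
            sum += dst[c];
        }

        const float inv_sum = 1.0f/sum;
        for (int c = 0; c < ncols; ++c) {
            dst[c] *= inv_sum;
        }
    }

The soft_max_f32_cuda launcher further down picks the block size as the smallest power of two, between WARP_SIZE and CUDA_SOFT_MAX_BLOCK_SIZE, that covers ncols_x, so short rows run with a single warp and skip the shared-memory stage entirely.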
@@ -5712,20 +5779,26 @@ static void rope_cuda(
 
 template<typename T>
 static void rope_neox_cuda(
-    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
     float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
 ) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float inv_ndims = -1.0f / n_dims;
+
     if (pos == nullptr) {
         rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows,
+            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, inv_ndims
         );
     } else {
         rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows,
+            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, inv_ndims
         );
     }
 }
@@ -5757,10 +5830,12 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
     diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 }
 
-static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
-
+static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
+    int nth = WARP_SIZE;
+    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+    const dim3 block_dims(nth, 1, 1);
     const dim3 block_nums(nrows_x, 1, 1);
-    soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
+    soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
 }
 
 static void im2col_f32_f16_cuda(const float * x, half * dst,
@@ -6023,18 +6098,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
         return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
-    }
+    }
+    if (nb0 == ts) {
         return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
-    } else {
-        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
-            const void * rx = (const void *) ((const char *) x + i1*nb1);
-            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
-            // pretend the row is a matrix with cols=1
-            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
-            if (r != cudaSuccess) return r;
-        }
-        return cudaSuccess;
     }
+    for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+        const void * rx = (const void *) ((const char *) x + i1*nb1);
+        void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+        // pretend the row is a matrix with cols=1
+        cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+        if (r != cudaSuccess) { return r; }
+    }
+    return cudaSuccess;
 }
 
 static void ggml_cuda_op_repeat(
@@ -6680,15 +6755,14 @@ inline void ggml_cuda_op_rope(
         GGML_ASSERT(false);
         rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
     } else if (is_neox) {
-        GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
         if (src0->type == GGML_TYPE_F32) {
             rope_neox_cuda(
-                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, main_stream
             );
         } else if (src0->type == GGML_TYPE_F16) {
             rope_neox_cuda(
-                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, main_stream
             );
         } else {
@@ -6812,14 +6886,18 @@ inline void ggml_cuda_op_soft_max(
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
+    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+
     const int64_t ne00 = src0->ne[0];
-    const int64_t
+    const int64_t nrows_x = ggml_nrows(src0);
+    const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
 
-
+    float scale = 1.0f;
+    memcpy(&scale, dst->op_params, sizeof(float));
+
+    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
 
-    (void) src1;
     (void) dst;
-    (void) src1_dd;
 }
 
 inline void ggml_cuda_op_scale(
@@ -6989,7 +7067,7 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-    const int64_t nrows0 = ggml_nrows(src0);
+    // const int64_t nrows0 = ggml_nrows(src0);
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
@@ -7090,7 +7168,7 @@ static void ggml_cuda_op_mul_mat(
         if (src0_on_device && src0_is_contiguous) {
             src0_dd[id] = (char *) src0_extra->data_device[id];
         } else {
-            const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+            // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
             src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
         }
 
@@ -7323,7 +7401,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }
 
     const int64_t ne10 = src1->ne[0];
 
@@ -7401,7 +7479,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-__global__ void k_compute_batched_ptrs(
+__global__ static void k_compute_batched_ptrs(
         const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
         const void ** ptrs_src, void ** ptrs_dst,
         int ne12, int ne13,
@@ -8017,7 +8095,7 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }
 
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8031,7 +8109,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     if (tensor->op == GGML_OP_MUL_MAT) {
         if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] =
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
 #endif
             return false;
         }
@@ -8316,14 +8394,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
     UNUSED(cgraph);
 }
 
-static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
     UNUSED(plan);
 }
 
-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
@@ -8339,8 +8417,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
             continue;
+        }
         assert(node->backend == GGML_BACKEND_GPU);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {