llama_cpp 0.9.3 → 0.9.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +68 -40
- data/ext/llama_cpp/src/ggml-quants.c +1 -1
- data/ext/llama_cpp/src/ggml.c +86 -8
- data/ext/llama_cpp/src/ggml.h +1 -0
- data/ext/llama_cpp/src/llama.cpp +264 -84
- data/ext/llama_cpp/src/llama.h +71 -3
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0fe656f26d7680d1b96c6949d40f4f615209c1c752b45ef145ac0f68b4af1d26
+  data.tar.gz: fb4d3c5b54a854edeeaf070b5497ba6656a5cff59b6b911b638551462004efb3
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6dc8bc34fcb2635e5fa99c31f134dca12af4c48a0c3f1effbbf209e6e3156f1f95bf133ed33c2eabc6e9f7988d668dcbdb0545a3807b38969680618ba8774848
+  data.tar.gz: 591d9ed44ed3b3a40424d3903659ad868afff727a2cfaffefd6222ba54f8a51fbfbab109ceea22a9a6bd3ca4661fb3947ca8f3f179ac2d0ad8cf8ba917b30ffe
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
+## [[0.9.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.3...v0.9.4)] - 2023-11-25
+
+- Bump bundled llama.cpp from b1523 to b1555.
+
 ## [[0.9.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.2...v0.9.3)] - 2023-11-18
 
 - Bump bundled llama.cpp from b1500 to b1523.
data/ext/llama_cpp/src/ggml-cuda.cu
CHANGED
@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <cinttypes>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -235,7 +236,7 @@ typedef float2 dfloat2;
 #endif //GGML_CUDA_F16
 
 static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
     int x32 = 0;
     x32 |= x16[0] << 0;
@@ -245,7 +246,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
 }
 
 static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
     int x32 = 0;
     x32 |= x16[0] << 0;
@@ -255,11 +256,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
 }
 
 static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
-    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
-    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 template<typename T>
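Every removed/added pair above is the same const-correctness fix: the cast target now keeps the const qualifier of the source pointer. A minimal sketch of the pattern (the function and variable names here are illustrative, not taken from ggml-cuda.cu):

#include <cstdint>

// Casting a pointer-to-const to a non-const pointer type discards the
// qualifier and triggers -Wcast-qual; casting to a const-qualified type
// preserves it and compiles cleanly.
static int read_packed_u16_pair(const int8_t * x8, int i32) {
    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // const preserved
    return (x16[0] << 0) | (x16[1] << 16);
}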
@@ -469,7 +470,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
 #define MUL_MAT_SRC1_COL_STRIDE 128
 
 #define MAX_STREAMS 8
-static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
 
 struct ggml_tensor_extra_gpu {
     void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
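This change (and the matching {{0.0f}} change in mul_mat_q further down) adds the nested brace level that a two-dimensional array initializer formally needs. A minimal sketch, with an illustrative array shape:

// A 2D array is an array of arrays, so its initializer takes a nested brace
// level; the single-brace form still zero-fills but emits -Wmissing-braces.
static float sums[4][8] = {{0.0f}}; // first element explicit, the rest value-initialized to 0.0f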
@@ -2248,6 +2249,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
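The (void)x_qh; (void)x_sc; statements inserted here and in the remaining allocate_tiles/load_tiles/vec_dot helpers below are the usual unused-parameter silencer. A minimal sketch with illustrative names:

// Some instantiations never touch these tile buffers; evaluating the
// parameters as discarded expressions suppresses unused-parameter warnings
// without changing behaviour.
static void init_tiles_stub(int ** x_ql, int ** x_qh, int ** x_sc) {
    (void)x_qh; (void)x_sc; // intentionally unused in this variant
    *x_ql = nullptr;        // only the first buffer is set up here
}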
@@ -2259,7 +2261,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-
+    (void)x_qh; (void)x_sc;
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
     GGML_CUDA_ASSUME(k >= 0);
@@ -2268,7 +2270,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_0;
     const int kqsx = k % QI4_0;
 
-    const block_q4_0 * bx0 = (block_q4_0 *) vx;
+    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
 
     float * x_dmf = (float *) x_dm;
 
@@ -2306,9 +2308,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const float * x_dmf = (float *) x_dm;
+    const float * x_dmf = (const float *) x_dm;
 
     int u[2*VDR_Q4_0_Q8_1_MMQ];
 
@@ -2342,6 +2345,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
@@ -2353,6 +2357,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2362,7 +2367,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_1;
     const int kqsx = k % QI4_1;
 
-    const block_q4_1 * bx0 = (block_q4_1 *) vx;
+    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2397,6 +2402,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 
@@ -2434,6 +2440,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
@@ -2445,6 +2452,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2454,7 +2462,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_0;
     const int kqsx = k % QI5_0;
 
-    const block_q5_0 * bx0 = (block_q5_0 *) vx;
+    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2509,6 +2517,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
@@ -2548,6 +2557,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
@@ -2559,6 +2569,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2568,7 +2579,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_1;
     const int kqsx = k % QI5_1;
 
-    const block_q5_1 * bx0 = (block_q5_1 *) vx;
+    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2620,6 +2631,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@@ -2654,6 +2666,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
@@ -2665,6 +2678,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2675,7 +2689,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kqsx = k % QI8_0;
     float * x_dmf = (float *) x_dm;
 
-    const block_q8_0 * bx0 = (block_q8_0 *) vx;
+    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2710,6 +2724,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
@@ -2743,6 +2758,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
@@ -2756,6 +2772,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2765,7 +2782,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI2_K;
     const int kqsx = k % QI2_K;
 
-    const block_q2_K * bx0 = (block_q2_K *) vx;
+    const block_q2_K * bx0 = (const block_q2_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2813,6 +2830,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const int kbx = k / QI2_K;
     const int ky = (k % QI2_K) * QR2_K;
@@ -2886,7 +2904,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI3_K;
     const int kqsx = k % QI3_K;
 
-    const block_q3_K * bx0 = (block_q3_K *) vx;
+    const block_q3_K * bx0 = (const block_q3_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2967,7 +2985,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
 
-    const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
 
     int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
 
@@ -3082,6 +3100,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@@ -3095,6 +3114,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3104,7 +3124,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_K; // == 0 if QK_K == 256
     const int kqsx = k % QI4_K; // == k if QK_K == 256
 
-    const block_q4_K * bx0 = (block_q4_K *) vx;
+    const block_q4_K * bx0 = (const block_q4_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3149,7 +3169,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
 
-        const int * scales = (int *) bxi->scales;
+        const int * scales = (const int *) bxi->scales;
 
         const int ksc = k % (WARP_SIZE/8);
 
@@ -3164,6 +3184,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
 
@@ -3263,6 +3284,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@@ -3276,6 +3298,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3285,7 +3308,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_K; // == 0 if QK_K == 256
     const int kqsx = k % QI5_K; // == k if QK_K == 256
 
-    const block_q5_K * bx0 = (block_q5_K *) vx;
+    const block_q5_K * bx0 = (const block_q5_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3341,7 +3364,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
 
-        const int * scales = (int *) bxi->scales;
+        const int * scales = (const int *) bxi->scales;
 
         const int ksc = k % (WARP_SIZE/8);
 
@@ -3356,6 +3379,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
 
@@ -3392,6 +3416,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@@ -3405,6 +3430,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3414,7 +3440,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI6_K; // == 0 if QK_K == 256
     const int kqsx = k % QI6_K; // == k if QK_K == 256
 
-    const block_q6_K * bx0 = (block_q6_K *) vx;
+    const block_q6_K * bx0 = (const block_q6_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3476,6 +3502,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
@@ -3518,7 +3545,7 @@ static __device__ __forceinline__ void mul_mat_q(
     __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
     __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
 
-    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
 
     for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
 
@@ -6023,18 +6050,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
         return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
-    } else if (nb0 == ts) {
+    }
+    if (nb0 == ts) {
         return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
-    } else {
-        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
-            const void * rx = (const void *) ((const char *) x + i1*nb1);
-            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
-            // pretend the row is a matrix with cols=1
-            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
-            if (r != cudaSuccess) return r;
-        }
-        return cudaSuccess;
     }
+    for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+        const void * rx = (const void *) ((const char *) x + i1*nb1);
+        void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+        // pretend the row is a matrix with cols=1
+        cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+        if (r != cudaSuccess) { return r; }
+    }
+    return cudaSuccess;
 }
 
 static void ggml_cuda_op_repeat(
@@ -6989,7 +7016,7 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-    const int64_t nrows0 = ggml_nrows(src0);
+    // const int64_t nrows0 = ggml_nrows(src0);
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
@@ -7090,7 +7117,7 @@ static void ggml_cuda_op_mul_mat(
         if (src0_on_device && src0_is_contiguous) {
             src0_dd[id] = (char *) src0_extra->data_device[id];
         } else {
-            const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+            // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
             src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
         }
 
@@ -7323,7 +7350,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }
 
     const int64_t ne10 = src1->ne[0];
 
@@ -7401,7 +7428,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-__global__ void k_compute_batched_ptrs(
+__global__ static void k_compute_batched_ptrs(
     const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
     const void ** ptrs_src, void ** ptrs_dst,
     int ne12, int ne13,
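Declaring the kernel static gives it internal linkage, so a file-local CUDA kernel needs no separate prototype. A minimal sketch with an illustrative kernel that is not part of ggml-cuda.cu:

// A file-local kernel with internal linkage cannot collide with a symbol of
// the same name in another translation unit and needs no forward declaration.
__global__ static void k_scale_example(float * data, float factor, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) { data[i] *= factor; }
}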
@@ -8017,7 +8044,7 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }
 
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8031,7 +8058,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     if (tensor->op == GGML_OP_MUL_MAT) {
         if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] =
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
 #endif
             return false;
         }
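The rewritten fprintf relies on the <cinttypes> header added at the top of the file so that int64_t values print portably. A minimal sketch of the PRId64 format macro (the helper name is illustrative):

#include <cinttypes>
#include <cstdint>
#include <cstdio>

// PRId64 expands to the correct printf length/conversion specifier for
// int64_t on every platform, keeping the format string portable.
static void report_ne3(int64_t ne3) {
    fprintf(stderr, "ne[3] = %" PRId64 "\n", ne3);
}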
@@ -8316,14 +8343,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
     UNUSED(cgraph);
 }
 
-static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
     UNUSED(plan);
 }
 
-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
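The [[noreturn]] attribute on these two unimplemented callbacks tells the compiler that the assertion never falls through. A minimal sketch with an illustrative function:

#include <cstdlib>

// Annotating a function that always aborts lets the compiler drop
// missing-return and unreachable-code diagnostics at its call sites.
[[noreturn]] static void not_implemented_stub() {
    std::abort(); // never returns
}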
@@ -8339,8 +8366,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
             continue;
+        }
         assert(node->backend == GGML_BACKEND_GPU);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {