llama_cpp 0.9.3 → 0.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +68 -40
- data/ext/llama_cpp/src/ggml-quants.c +1 -1
- data/ext/llama_cpp/src/ggml.c +86 -8
- data/ext/llama_cpp/src/ggml.h +1 -0
- data/ext/llama_cpp/src/llama.cpp +264 -84
- data/ext/llama_cpp/src/llama.h +71 -3
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0fe656f26d7680d1b96c6949d40f4f615209c1c752b45ef145ac0f68b4af1d26
+  data.tar.gz: fb4d3c5b54a854edeeaf070b5497ba6656a5cff59b6b911b638551462004efb3
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6dc8bc34fcb2635e5fa99c31f134dca12af4c48a0c3f1effbbf209e6e3156f1f95bf133ed33c2eabc6e9f7988d668dcbdb0545a3807b38969680618ba8774848
+  data.tar.gz: 591d9ed44ed3b3a40424d3903659ad868afff727a2cfaffefd6222ba54f8a51fbfbab109ceea22a9a6bd3ca4661fb3947ca8f3f179ac2d0ad8cf8ba917b30ffe
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
+## [[0.9.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.3...v0.9.4)] - 2023-11-25
+
+- Bump bundled llama.cpp from b1523 to b1555.
+
 ## [[0.9.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.9.2...v0.9.3)] - 2023-11-18
 
 - Bump bundled llama.cpp from b1500 to b1523.
data/ext/llama_cpp/src/ggml-cuda.cu
CHANGED
@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <cinttypes>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -235,7 +236,7 @@ typedef float2 dfloat2;
 #endif //GGML_CUDA_F16
 
 static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
     int x32 = 0;
     x32 |= x16[0] << 0;
@@ -245,7 +246,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
 }
 
 static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
     int x32 = 0;
     x32 |= x16[0] << 0;
@@ -255,11 +256,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
 }
 
 static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
-    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
-    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 template<typename T>
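The four hunks above are const-correctness fixes: a pointer derived from a const-qualified argument is now cast to a pointer-to-const instead of silently dropping the qualifier, which quiets cast-qual style warnings without changing behavior. A minimal, self-contained sketch of the pattern (hypothetical helper name, not code from the diff):

// load_u32_example is a hypothetical device helper illustrating the cast style
// used above: the source buffer stays const all the way through.
static __device__ __forceinline__ int load_u32_example(const uint8_t * x8, int i32) {
    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // const preserved
    int x32 = 0;
    x32 |= x16[0] << 0;
    x32 |= x16[1] << 16;
    return x32;
}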
@@ -469,7 +470,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
 #define MUL_MAT_SRC1_COL_STRIDE 128
 
 #define MAX_STREAMS 8
-static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
 
 struct ggml_tensor_extra_gpu {
     void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
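The g_cudaStreams change only adds an inner pair of braces: for a two-dimensional array, both initializer spellings zero-initialize every element, but the nested form matches the array's structure and avoids missing-braces warnings. A small sketch with hypothetical names (plain C++, not from the diff):

// Both declarations zero-initialize all 32 elements; the second spells out the
// nesting so the initializer's shape matches the array's shape.
static float flat_init[4][8]   = { 0.0f };
static float nested_init[4][8] = { { 0.0f } };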
@@ -2248,6 +2249,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
@@ -2259,7 +2261,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-
+    (void)x_qh; (void)x_sc;
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
     GGML_CUDA_ASSUME(k >= 0);
@@ -2268,7 +2270,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_0;
     const int kqsx = k % QI4_0;
 
-    const block_q4_0 * bx0 = (block_q4_0 *) vx;
+    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
 
     float * x_dmf = (float *) x_dm;
 
@@ -2306,9 +2308,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const float * x_dmf = (float *) x_dm;
+    const float * x_dmf = (const float *) x_dm;
 
     int u[2*VDR_Q4_0_Q8_1_MMQ];
 
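From here on, many hunks insert `(void)x_qh;` and `(void)x_sc;` at the top of the tile helpers. Casting an argument to void is the conventional way to mark it as intentionally unused while keeping a uniform signature across the q4_0 through q6_K variants; it generates no code. A minimal sketch with a hypothetical function (not the library's API):

// Hypothetical tile helper: only x_ql is used here, but the signature must
// match the other quantization variants, so the unused pointers are consumed
// with void casts to silence unused-parameter warnings.
template <int mmq_y>
static __device__ __forceinline__ void allocate_tiles_example(int ** x_ql, int ** x_qh, int ** x_sc) {
    (void)x_qh; (void)x_sc;
    __shared__ int tile_x_qs[mmq_y * 32 + mmq_y];
    *x_ql = tile_x_qs;
}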
@@ -2342,6 +2345,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
@@ -2353,6 +2357,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2362,7 +2367,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_1;
     const int kqsx = k % QI4_1;
 
-    const block_q4_1 * bx0 = (block_q4_1 *) vx;
+    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2397,6 +2402,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 
@@ -2434,6 +2440,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
@@ -2445,6 +2452,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2454,7 +2462,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_0;
     const int kqsx = k % QI5_0;
 
-    const block_q5_0 * bx0 = (block_q5_0 *) vx;
+    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2509,6 +2517,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
@@ -2548,6 +2557,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
@@ -2559,6 +2569,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2568,7 +2579,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_1;
     const int kqsx = k % QI5_1;
 
-    const block_q5_1 * bx0 = (block_q5_1 *) vx;
+    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2620,6 +2631,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
     const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@@ -2654,6 +2666,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
     __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
@@ -2665,6 +2678,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2675,7 +2689,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kqsx = k % QI8_0;
     float * x_dmf = (float *) x_dm;
 
-    const block_q8_0 * bx0 = (block_q8_0 *) vx;
+    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2710,6 +2724,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
@@ -2743,6 +2758,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
@@ -2756,6 +2772,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2765,7 +2782,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI2_K;
     const int kqsx = k % QI2_K;
 
-    const block_q2_K * bx0 = (block_q2_K *) vx;
+    const block_q2_K * bx0 = (const block_q2_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2813,6 +2830,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const int kbx = k / QI2_K;
     const int ky = (k % QI2_K) * QR2_K;
@@ -2886,7 +2904,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI3_K;
     const int kqsx = k % QI3_K;
 
-    const block_q3_K * bx0 = (block_q3_K *) vx;
+    const block_q3_K * bx0 = (const block_q3_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2967,7 +2985,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
 
-    const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
 
     int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
 
@@ -3082,6 +3100,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@@ -3095,6 +3114,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3104,7 +3124,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_K; // == 0 if QK_K == 256
     const int kqsx = k % QI4_K; // == k if QK_K == 256
 
-    const block_q4_K * bx0 = (block_q4_K *) vx;
+    const block_q4_K * bx0 = (const block_q4_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3149,7 +3169,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
     const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
 
-    const int * scales = (int *) bxi->scales;
+    const int * scales = (const int *) bxi->scales;
 
     const int ksc = k % (WARP_SIZE/8);
 
@@ -3164,6 +3184,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
 
@@ -3263,6 +3284,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@@ -3276,6 +3298,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3285,7 +3308,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_K; // == 0 if QK_K == 256
     const int kqsx = k % QI5_K; // == k if QK_K == 256
 
-    const block_q5_K * bx0 = (block_q5_K *) vx;
+    const block_q5_K * bx0 = (const block_q5_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3341,7 +3364,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
     const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
 
-    const int * scales = (int *) bxi->scales;
+    const int * scales = (const int *) bxi->scales;
 
     const int ksc = k % (WARP_SIZE/8);
 
@@ -3356,6 +3379,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
 
@@ -3392,6 +3416,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;
 
     __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@@ -3405,6 +3430,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;
 
     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3414,7 +3440,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI6_K; // == 0 if QK_K == 256
     const int kqsx = k % QI6_K; // == k if QK_K == 256
 
-    const block_q6_K * bx0 = (block_q6_K *) vx;
+    const block_q6_K * bx0 = (const block_q6_K *) vx;
 
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3476,6 +3502,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;
 
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
@@ -3518,7 +3545,7 @@ static __device__ __forceinline__ void mul_mat_q(
     __shared__ int tile_y_qs[mmq_x * WARP_SIZE];
     __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
 
-    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
 
     for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
 
@@ -6023,18 +6050,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
         return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
-    } else if (nb0 == ts) {
+    }
+    if (nb0 == ts) {
         return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
-    } else {
-        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
-            const void * rx = (const void *) ((const char *) x + i1*nb1);
-            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
-            // pretend the row is a matrix with cols=1
-            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
-            if (r != cudaSuccess) return r;
-        }
-        return cudaSuccess;
     }
+    for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+        const void * rx = (const void *) ((const char *) x + i1*nb1);
+        void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+        // pretend the row is a matrix with cols=1
+        cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+        if (r != cudaSuccess) { return r; }
+    }
+    return cudaSuccess;
 }
 
 static void ggml_cuda_op_repeat(
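The ggml_cuda_cpy_tensor_2d hunk above is a pure control-flow cleanup: the else-if / else ladder becomes a sequence of early returns and every single-statement body gains braces. A generic sketch of the same restructuring (hypothetical function, not from the diff):

// After the cleanup: no else branches, each case exits as soon as it is decided.
static int copy_rows_example(bool fully_contiguous, bool row_contiguous, int n_rows) {
    if (fully_contiguous) { return 0; } // one bulk copy would suffice
    if (row_contiguous)   { return 1; } // one 2-D copy would suffice
    for (int i = 0; i < n_rows; i++) {
        if (i < 0) { return -1; }       // per-row error path, braced even for one statement
    }
    return 2;                           // per-row fallback completed
}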
@@ -6989,7 +7016,7 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-    const int64_t nrows0 = ggml_nrows(src0);
+    // const int64_t nrows0 = ggml_nrows(src0);
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
@@ -7090,7 +7117,7 @@ static void ggml_cuda_op_mul_mat(
         if (src0_on_device && src0_is_contiguous) {
             src0_dd[id] = (char *) src0_extra->data_device[id];
         } else {
-            const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+            // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
             src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
         }
 
@@ -7323,7 +7350,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }
 
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }
 
     const int64_t ne10 = src1->ne[0];
 
@@ -7401,7 +7428,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 
-__global__ void k_compute_batched_ptrs(
+__global__ static void k_compute_batched_ptrs(
     const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
     const void ** ptrs_src, void ** ptrs_dst,
     int ne12, int ne13,
@@ -8017,7 +8044,7 @@ void ggml_cuda_free_scratch() {
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }
 
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8031,7 +8058,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     if (tensor->op == GGML_OP_MUL_MAT) {
         if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] =
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
 #endif
             return false;
         }
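The reworked fprintf uses the PRId64 macro from <cinttypes> (the include added at the top of this file) so that int64_t values are printed portably: the macro supplies the conversion suffix and is spliced into the format string after a literal "%". A short standalone sketch, with a hypothetical dimension value standing in for the tensor shape:

#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne3 = 4; // hypothetical, stands in for src0->ne[3]
    // "%" PRId64 expands to the correct format specifier for int64_t on any platform.
    std::fprintf(stderr, "ne[3] = %" PRId64 "\n", ne3);
    return 0;
}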
@@ -8316,14 +8343,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
     UNUSED(cgraph);
 }
 
-static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
     UNUSED(plan);
 }
 
-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
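Marking the two unimplemented graph-plan callbacks [[noreturn]] documents, and lets the compiler assume, that control never comes back from them; the GGML_ASSERT in each stub always aborts. A minimal sketch of the attribute on a hypothetical stub (not the library's API):

#include <cstdlib>

// Hypothetical stub: the attribute tells the compiler this function never
// returns normally, so no "fall off the end" style diagnostics apply.
[[noreturn]] static void not_implemented_example() {
    std::abort();
}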
@@ -8339,8 +8366,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
             continue;
+        }
         assert(node->backend == GGML_BACKEND_GPU);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {